In [None]:
import pickle
import os
from dotenv import load_dotenv
from openai import AsyncOpenAI
from supabase import create_client, Client

# Load the podcast data from the pickle file
with open('podcast_data.pkl', 'rb') as file:
    podcast_data = pickle.load(file)

# Load environment variables from the .env file
load_dotenv()

# Initialize OpenAI client
openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Initialize Supabase client
supabase: Client = create_client(
    os.getenv("SUPABASE_URL"),
    os.getenv("SUPABASE_SERVICE_KEY")
)


In [None]:
episode_number_to_match = ['042', '043', '132', '133', '200', '201']

# Create a subset of the podcast data where the episode number matches the specified values
subset_podcast_data = podcast_data[podcast_data['episode number'].isin(episode_number_to_match)]


In [None]:
import os
import openai
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from .env file
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_summary(transcript: str, existing_summary: str) -> str:
    """
    Uses GPT-4o-mini to summarize the key takeaway points from both the transcript
    and the existing summary. Returns a concise bullet-point list.
    """
    prompt_content = (
        "Below is the transcript and an existing summary for a podcast episode. "
        "Summarize the key takeaway points in bullet-point form, with each bullet representing a distinct key point. You can use multiple sentences if you need to. "
        "Make sure to include all the important details and main ideas. "
        "Do not include any additional commentary.\n\n"
        f"Transcript:\n{transcript}\n\n"
        f"Existing Summary:\n{existing_summary}"
    )
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant that creates clear, concise summaries of podcasts. "
                    "You think from the perspective of a listener who wants to quickly grasp the main points. "
                    "Your audience is data scientists and machine learning engineers who are listening to try to expand their skillsets and learn from the experiences of the podcasters."
                )
            },
            {
                "role": "user",
                "content": prompt_content
            }
        ],
        temperature=0.2
    )
    return response.choices[0].message.content.strip()

def add_gpt_summaries(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each row in the DataFrame, sends both the transcript and the existing summary to the model for summarization,
    then stores the result in a new 'GPT Summary' column.
    """
    summaries = []
    for _, row in df.iterrows():
        transcript = row['Transcript']
        # Use the 'Summary' column if it exists; otherwise, default to an empty string.
        existing_summary = row.get('Summary', '')
        summary_text = generate_summary(transcript, existing_summary)
        summaries.append(summary_text)
    df['GPT Summary'] = summaries
    return df

# Example usage (assuming podcast_data is your DataFrame with 'Transcript' and 'Summary' columns):
df_with_summaries = add_gpt_summaries(subset_podcast_data)



In [None]:
import os
import openai
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from .env file
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_memorable_quotes(transcript: str, existing_summary: str) -> str:
    """
    Uses GPT-4o-mini to extract the most memorable quotes from the podcast transcript,
    explaining each quote with context and including the last time stamp for that quote.
    Returns a bullet-point list of quotes with their context.
    """
    prompt_content = (
        "Below is the transcript and an existing summary for a podcast episode. "
        "Extract the most memorable quotes from the transcript and, for each quote, "
        "provide a brief explanation of its context along with the last time stamp where that quote appears. "
        "Present the results as a bullet-point list, where each bullet contains the quote, context, and time stamp. "
        "Do not include any additional commentary.\n\n"
        f"Transcript:\n{transcript}\n\n"
        f"Existing Summary:\n{existing_summary}"
    )
    
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant skilled in extracting memorable quotes from podcasts. "
                    "Your output should be clear, concise, and formatted as a bullet-point list. "
                    "Your audience is data scientists and machine learning engineers who are listening to try to expand their skillsets and learn from the experiences of the podcasters."
                    "When extracting quotes, focus on the ones that are impactful, insightful, or thought-provoking. "
                    "When stating people's names, use their full names. "
                    "Each bullet should contain the quote, the speakers full name, a brief explanation of its context, and the last time stamp in the transcript where that quote occurs."
                )
            },
            {
                "role": "user",
                "content": prompt_content
            }
        ],
        temperature=0.2
    )
    return response.choices[0].message.content.strip()

def add_gpt_memorable_quotes(df: pd.DataFrame) -> pd.DataFrame:
    """
    For each row in the DataFrame, sends both the transcript and the existing summary to the model for extracting memorable quotes,
    then stores the result in a new 'GPT Memorable Quotes' column.
    """
    quotes_list = []
    for _, row in df.iterrows():
        transcript = row['Transcript']
        # Use the 'Summary' column if it exists; otherwise, default to an empty string.
        existing_summary = row.get('GPT Summary', '')
        quotes_text = generate_memorable_quotes(transcript, existing_summary)
        quotes_list.append(quotes_text)
    df['GPT Memorable Quotes'] = quotes_list
    return df

# Example usage (assuming podcast_data is your DataFrame with 'Transcript' and 'Summary' columns):
df_with_memorable_quotes = add_gpt_memorable_quotes(df_with_summaries)


In [None]:
df_for_chunks = subset_podcast_data[['Title', 'URL','Date Published', 'episode number', 'Transcript']]



In [15]:
import os
import json
import asyncio
import pandas as pd
import re
from datetime import datetime, timezone
from typing import List, Dict, Any
from openai import AsyncOpenAI
from supabase import create_client, Client
from dataclasses import dataclass

# Load environment variables
openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

supabase: Client = create_client(
    os.getenv("SUPABASE_URL"),
    os.getenv("SUPABASE_SERVICE_KEY")
)

# Limit concurrent API calls to 4 at a time (adjust as needed)
API_SEMAPHORE = asyncio.Semaphore(4)
import asyncio
import time
import random

@dataclass
class ProcessedChunk:
    episode_number: int
    title: str
    url: str
    chunk_number: int
    summary: str
    content: str
    metadata: Dict[str, Any]
    embedding: List[float]

def find_timestamps(text: str) -> List[int]:
    """
    Identify all timestamps in the transcript text.
    Returns a list of character indices where timestamps occur.
    """
    timestamp_pattern = re.compile(r"(?:\d{1,2}:\d{2}(?::\d{2}(?:\.\d{1,2})?)?)")
    matches = [m.start() for m in timestamp_pattern.finditer(text)]
    return matches

def chunk_transcript(transcript: str, episode_number: int, chunk_size: int = 7000) -> List[str]:
    """
    Chunk transcript while respecting timestamps as boundaries.
    If no timestamps exist (episodes 001-050), chunk based on paragraph or sentence breaks.
    """
    chunks = []
    start = 0
    text_length = len(transcript)

    # Extract timestamps if available (episodes > 50)
    timestamps = find_timestamps(transcript) if episode_number > 50 else []

    while start < text_length:
        end = start + chunk_size

        # If timestamps exist, find the nearest valid chunk end
        if timestamps:
            valid_timestamps = [t for t in timestamps if start < t < end]
            if valid_timestamps:
                end = max(valid_timestamps)  # Extend to the closest timestamp within range

        # If no timestamps, fallback to paragraph or sentence-based chunking
        else:
            paragraph_break = transcript.rfind('\n\n', start, end)
            sentence_break = transcript.rfind('. ', start, end)

            if paragraph_break > start + chunk_size * 0.3:  # Favor paragraph breaks
                end = paragraph_break
            elif sentence_break > start + chunk_size * 0.3:  # Then sentence breaks
                end = sentence_break + 1

        # Extract chunk and clean it up
        chunk = transcript[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move to the next chunk
        start = end

    return chunks

async def get_summary(chunk: str) -> str:
    """Generate a GPT-crafted summary with strict rate limiting and retries."""
    prompt = f"Summarize the following podcast transcript section. When doing so please refer to the speakers by their full name:\n\n{chunk[:1000]}..."
    
    retries = 7  # Number of retries
    base_delay = 2  # Start with a 2-second delay
    max_delay = 30  # Maximum wait time per request

    async with API_SEMAPHORE:  # Limit concurrent API requests
        for attempt in range(retries):
            try:
                # Add a small sleep to naturally space requests
                await asyncio.sleep(2)  

                response = await openai_client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.choices[0].message.content.strip()

            except Exception as e:
                error_message = str(e)
                
                if "rate_limit_exceeded" in error_message:
                    wait_time = min(base_delay * (2 ** attempt) + random.uniform(0, 2), max_delay)
                    print(f"Rate limit hit. Retrying in {wait_time:.2f} seconds...")
                    await asyncio.sleep(wait_time)
                else:
                    print(f"Error generating summary: {error_message}")
                    break  # If it's not a rate limit error, stop retrying

    return "Error generating summary"

async def get_embedding(text: str) -> List[float]:
    """Get embedding vector from OpenAI."""
    try:
        response = await openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return [0] * 1536  # Return zero vector on error

async def process_chunk(chunk: str, chunk_number: int, episode_number: int, title: str, url: str) -> ProcessedChunk:
    """Process a single chunk: generate summary and embedding."""
    summary = await get_summary(chunk)
    embedding = await get_embedding(chunk)
    
    metadata = {
        "episode_number": episode_number,
        "chunk_size": len(chunk),
        "processed_at": datetime.now(timezone.utc).isoformat(),
    }
    
    return ProcessedChunk(
        episode_number=episode_number,
        title=title,
        url=url,
        chunk_number=chunk_number,
        summary=summary,
        content=chunk,
        metadata=metadata,
        embedding=embedding
    )

async def insert_chunk(chunk: ProcessedChunk):
    """Insert a processed chunk into Supabase."""
    try:
        data = {
            "episode_number": chunk.episode_number,
            "title": chunk.title,
            "url": chunk.url,
            "chunk_number": chunk.chunk_number,
            "summary": chunk.summary,
            "content": chunk.content,
            "metadata": json.dumps(chunk.metadata),
            "embedding": chunk.embedding
        }
        
        result = supabase.table("podcast_transcripts").insert(data).execute()
        print(f"Inserted chunk {chunk.chunk_number} for episode {chunk.episode_number}")
        return result
    except Exception as e:
        print(f"Error inserting chunk: {e}")
        return None

async def process_and_store_episode(df_row):
    """Process an entire podcast episode's transcript and store its chunks."""
        # Convert episode number from string to integer safely
    try:
        episode_number = int(df_row["episode number"])
    except ValueError:
        print(f"Warning: Could not convert episode number '{df_row['episode number']}' to int.")
        return  # Skip processing this episode if it fails conversion
    
    title = df_row["Title"]
    url = df_row["URL"]
    transcript = df_row["Transcript"]

    # Split into chunks
    chunks = chunk_transcript(transcript, episode_number)

    # Process chunks in parallel
    tasks = [
        process_chunk(chunk, i, episode_number, title, url)
        for i, chunk in enumerate(chunks)
    ]
    processed_chunks = await asyncio.gather(*tasks)

    # Store chunks in parallel
    insert_tasks = [insert_chunk(chunk) for chunk in processed_chunks]
    await asyncio.gather(*insert_tasks)

async def process_all_episodes(df: pd.DataFrame):
    """Clears the table and processes all episodes in the dataframe."""
    
    # Clear existing data before inserting new chunks
    try:
        supabase.table("podcast_transcripts").delete().neq("id", 0).execute()
        print("Cleared podcast_transcripts table before re-uploading.")
    except Exception as e:
        print(f"Error clearing table: {e}")

    # Process all episodes
    tasks = [process_and_store_episode(df.iloc[i]) for i in range(len(df))]
    await asyncio.gather(*tasks)

import nest_asyncio
nest_asyncio.apply()  # Allows running nested event loops

await process_all_episodes(podcast_data)  # Use await instead of asyncio.run()


Cleared podcast_transcripts table before re-uploading.
Inserted chunk 0 for episode 130
Inserted chunk 1 for episode 130
Inserted chunk 2 for episode 130
Inserted chunk 3 for episode 130
Inserted chunk 4 for episode 130
Inserted chunk 5 for episode 130
Inserted chunk 6 for episode 130
Inserted chunk 7 for episode 130
Inserted chunk 8 for episode 130
Inserted chunk 9 for episode 130
Inserted chunk 10 for episode 130
Inserted chunk 0 for episode 160
Inserted chunk 1 for episode 160
Inserted chunk 2 for episode 160
Inserted chunk 3 for episode 160
Inserted chunk 4 for episode 160
Inserted chunk 5 for episode 160
Inserted chunk 6 for episode 160
Inserted chunk 7 for episode 160
Inserted chunk 8 for episode 160
Inserted chunk 9 for episode 160
Inserted chunk 10 for episode 160
Inserted chunk 0 for episode 161
Inserted chunk 1 for episode 161
Inserted chunk 2 for episode 161
Inserted chunk 3 for episode 161
Inserted chunk 4 for episode 161
Inserted chunk 5 for episode 161
Inserted chunk 6 fo