In [None]:
import pickle
import os
from dotenv import load_dotenv
from openai import AsyncOpenAI
from supabase import create_client, Client

# Deserialize the podcast data stored in the local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('podcast_data.pkl', 'rb') as pickle_file:
    podcast_data = pickle.load(pickle_file)

# Read the .env file so the credentials below are available via os.getenv
load_dotenv()

# Async OpenAI client, keyed from the environment
openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Supabase client built from the project URL and the service key
supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_SERVICE_KEY"))


In [None]:

episode_number_to_match = ['042','043','132','133','200','201']

# Keep only the rows whose episode number is one of the listed values.
# `Series in list` asks Python for a single truth value of the whole Series and
# raises "The truth value of a Series is ambiguous"; `.isin` builds the
# element-wise boolean mask that row selection needs.
subset_podcast_data = podcast_data[podcast_data['episode number'].isin(episode_number_to_match)]


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
import os
import openai
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from the .env file, then set the key on the
# module-level `openai` client that generate_summary calls below.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_summary(transcript: str, existing_summary: str) -> str:
    """
    Ask GPT-4o-mini for a bullet-point list of an episode's key takeaways.

    Args:
        transcript: Episode transcript text, inserted verbatim into the prompt.
        existing_summary: Existing summary text, appended after the transcript.

    Returns:
        The content of the first completion choice, stripped of surrounding whitespace.
    """
    system_message = {
        "role": "system",
        "content": (
            "You are a helpful assistant that creates clear, concise summaries of podcasts. "
            "You think from the perspective of a listener who wants to quickly grasp the main points. "
            "Your audience is data scientists and machine learning engineers who are listening to try to expand their skillsets and learn from the experiences of the podcasters."
        ),
    }

    # Fixed instructions first, then the two per-episode inputs.
    user_prompt = "".join([
        "Below is the transcript and an existing summary for a podcast episode. "
        "Summarize the key takeaway points in bullet-point form, with each bullet representing a distinct key point. You can use multiple sentences if you need to. "
        "Make sure to include all the important details and main ideas. "
        "Do not include any additional commentary.\n\n",
        f"Transcript:\n{transcript}\n\n",
        f"Existing Summary:\n{existing_summary}",
    ])
    user_message = {"role": "user", "content": user_prompt}

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def add_gpt_summaries(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize every row with generate_summary and record the results.

    Each row's 'Transcript' is sent together with its 'Summary' value (an empty
    string when the row has no 'Summary' entry). The results are written to a
    'GPT Summary' column on the DataFrame that was passed in, and that same
    DataFrame object is returned.
    """
    df['GPT Summary'] = [
        generate_summary(episode['Transcript'], episode.get('Summary', ''))
        for _, episode in df.iterrows()
    ]
    return df

# Example usage (assuming podcast_data is your DataFrame with 'Transcript' and 'Summary' columns).
# add_gpt_summaries assigns the new column on the frame it receives and returns that
# same object, so podcast_data itself also gains the 'GPT Summary' column.
df_with_summaries = add_gpt_summaries(podcast_data)



In [None]:
import os
import openai
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from the .env file and set the OpenAI API key.
# (Same setup as the summary cell above; it is applied again here.)
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_memorable_quotes(transcript: str, existing_summary: str) -> str:
    """
    Uses GPT-4o-mini to extract the most memorable quotes from the podcast transcript,
    explaining each quote with context and including the last time stamp for that quote.

    Args:
        transcript: Episode transcript text, inserted verbatim into the prompt.
        existing_summary: Summary text for the episode, appended after the transcript.

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace. The prompt asks for a bullet-point list of quotes with context.
    """
    prompt_content = (
        "Below is the transcript and an existing summary for a podcast episode. "
        "Extract the most memorable quotes from the transcript and, for each quote, "
        "provide a brief explanation of its context along with the last time stamp where that quote appears. "
        "Present the results as a bullet-point list, where each bullet contains the quote, context, and time stamp. "
        "Do not include any additional commentary.\n\n"
        f"Transcript:\n{transcript}\n\n"
        f"Existing Summary:\n{existing_summary}"
    )

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                # Adjacent literals are concatenated, so every sentence except the
                # last must end with a space. The audience sentence previously did
                # not, producing "podcasters.When extracting" in the prompt.
                "content": (
                    "You are a helpful assistant skilled in extracting memorable quotes from podcasts. "
                    "Your output should be clear, concise, and formatted as a bullet-point list. "
                    "Your audience is data scientists and machine learning engineers who are listening to try to expand their skillsets and learn from the experiences of the podcasters. "
                    "When extracting quotes, focus on the ones that are impactful, insightful, or thought-provoking. "
                    "When stating people's names, use their full names. "
                    "Each bullet should contain the quote, the speakers full name, a brief explanation of its context, and the last time stamp in the transcript where that quote occurs."
                )
            },
            {
                "role": "user",
                "content": prompt_content
            }
        ],
        temperature=0.2
    )
    return response.choices[0].message.content.strip()

def add_gpt_memorable_quotes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract memorable quotes for every row with generate_memorable_quotes.

    Each row's 'Transcript' is sent together with its 'GPT Summary' value (an
    empty string when the row has no 'GPT Summary' entry). The results are
    written to a 'GPT Memorable Quotes' column on the DataFrame that was passed
    in, and that same DataFrame object is returned.
    """
    df['GPT Memorable Quotes'] = [
        generate_memorable_quotes(episode['Transcript'], episode.get('GPT Summary', ''))
        for _, episode in df.iterrows()
    ]
    return df

# Example usage (assuming podcast_data is your DataFrame with a 'Transcript' column;
# add_gpt_memorable_quotes also reads 'GPT Summary' when present, otherwise it sends
# an empty summary).
df_with_memorable_quotes = add_gpt_memorable_quotes(podcast_data)


In [None]:
# Print the title, generated summary and extracted quotes for each episode,
# followed by blank lines as a separator.
for _, episode in df_with_memorable_quotes.iterrows():
    print(
        f"Title: {episode['Title']}",
        f"GPT Summary: {episode['GPT Summary']}",
        f"\nGPT Memorable Quotes: {episode['GPT Memorable Quotes']}",
        "\n",
        sep="\n",
    )

In [None]:
# Keep only the columns the chunking/embedding step reads.
df_for_chunks = df_with_memorable_quotes[['Title', 'URL','Date Published', 'episode number', 'Transcript']]
# Preview the frame that was just built. The previous `print(df_subset)` referenced
# a name that is never defined in this notebook and raised NameError.
print(df_for_chunks.head())

In [None]:
import asyncio
import json
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Dict, Any

import pandas as pd
from openai import AsyncOpenAI
from supabase import create_client, Client

# Build the API clients from environment variables. This cell does not call
# load_dotenv() itself, so the variables must already be in the process
# environment (the first cell of the notebook calls load_dotenv()).
openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

supabase: Client = create_client(
    os.getenv("SUPABASE_URL"),
    os.getenv("SUPABASE_SERVICE_KEY")
)

# `dataclass` comes from the standard-library `dataclasses` module; it must be
# imported in this cell's import block or the decorator raises NameError.
@dataclass
class ProcessedChunk:
    """One transcript chunk plus the data generated for it, ready to be stored."""

    # Episode the chunk belongs to.
    # NOTE(review): an earlier cell filters 'episode number' against zero-padded
    # strings such as '042', so values may arrive as str rather than int — confirm.
    episode_number: int
    # Episode title and URL, copied from the source row.
    title: str
    url: str
    # Position of the chunk within its episode (enumerate index, starting at 0).
    chunk_number: int
    # Model-generated summary of the chunk.
    summary: str
    # The chunk text itself.
    content: str
    # Extra bookkeeping (episode number, chunk size, processing time).
    metadata: Dict[str, Any]
    # Embedding vector for the chunk text.
    embedding: List[float]

def chunk_transcript(transcript: str, timestamps: List[int], chunk_size: int = 2000) -> List[str]:
    """
    Split a transcript into chunks of at most ``chunk_size`` characters,
    preferring to cut at timestamp offsets.

    For each chunk the cut is placed at the largest timestamp offset that lies
    after the chunk's start and no further than ``chunk_size`` characters from
    it. When no such offset exists, the chunk is cut at the full ``chunk_size``.
    The final chunk simply runs to the end of the text.

    Args:
        transcript: Full transcript text.
        timestamps: Character offsets into ``transcript`` where a cut is allowed.
            May be empty and need not be sorted.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The non-empty chunks in order, each stripped of surrounding whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the loop could never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start = 0
    text_length = len(transcript)

    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            # Only offsets strictly after `start` are considered, so `end` always
            # moves forward. The previous character-by-character backward scan
            # could step past `start` (or run forever with no timestamps), which
            # produced empty chunks and an infinite loop.
            end = max((t for t in timestamps if start < t <= end), default=end)

        # Extract chunk and clean it up
        chunk = transcript[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move to next chunk
        start = end

    return chunks

async def get_summary(chunk: str) -> str:
    """
    Request a summary of a transcript chunk from gpt-4o-mini.

    Only the first 1000 characters of the chunk are included in the prompt.
    Any exception raised while calling the API or reading the reply is printed
    and the literal string "Error generating summary" is returned instead.
    """
    excerpt = chunk[:1000]
    request_messages = [
        {
            "role": "user",
            "content": f"Summarize the following podcast transcript section:\n\n{excerpt}...",
        }
    ]

    try:
        completion = await openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=request_messages,
        )
        reply = completion.choices[0].message
        return reply.content.strip()
    except Exception as e:
        print(f"Error generating summary: {e}")
        return "Error generating summary"

async def get_embedding(text: str) -> List[float]:
    """
    Fetch the text-embedding-3-small embedding for ``text``.

    Any exception raised while calling the API or reading the reply is printed
    and a 1536-element vector of zeros is returned instead.
    """
    try:
        result = await openai_client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        )
        first_item = result.data[0]
        return first_item.embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        # Zero vector stands in for the missing embedding
        fallback_vector = [0] * 1536
        return fallback_vector

async def process_chunk(chunk: str, chunk_number: int, episode_number: int, title: str, url: str) -> ProcessedChunk:
    """
    Process a single chunk: generate its summary and embedding and bundle the result.

    Args:
        chunk: Text of the chunk.
        chunk_number: Position of the chunk within its episode.
        episode_number: Episode the chunk belongs to.
        title: Episode title.
        url: Episode URL.

    Returns:
        A ProcessedChunk holding the inputs, the generated summary and embedding,
        and a metadata dict with the episode number, the chunk length and a UTC
        ISO-8601 processing timestamp.
    """
    # The summary and the embedding do not depend on each other, so both
    # requests are issued concurrently instead of one after the other.
    summary, embedding = await asyncio.gather(
        get_summary(chunk),
        get_embedding(chunk),
    )

    metadata = {
        "episode_number": episode_number,
        "chunk_size": len(chunk),
        "processed_at": datetime.now(timezone.utc).isoformat(),
    }

    return ProcessedChunk(
        episode_number=episode_number,
        title=title,
        url=url,
        chunk_number=chunk_number,
        summary=summary,
        content=chunk,
        metadata=metadata,
        embedding=embedding
    )

async def insert_chunk(chunk: ProcessedChunk):
    """
    Write one processed chunk to the "podcast_transcripts" Supabase table.

    Returns the value produced by ``execute()`` on success. Any exception is
    printed and ``None`` is returned instead.
    """
    try:
        record = dict(
            episode_number=chunk.episode_number,
            title=chunk.title,
            url=chunk.url,
            chunk_number=chunk.chunk_number,
            summary=chunk.summary,
            content=chunk.content,
            # metadata is stored as a JSON-encoded string
            metadata=json.dumps(chunk.metadata),
            embedding=chunk.embedding,
        )

        # NOTE(review): execute() is called without await, so it presumably runs
        # synchronously inside this coroutine — confirm against the client in use.
        table = supabase.table("podcast_transcripts")
        response = table.insert(record).execute()
        print(f"Inserted chunk {chunk.chunk_number} for episode {chunk.episode_number}")
        return response
    except Exception as e:
        print(f"Error inserting chunk: {e}")
        return None

async def process_and_store_episode(df_row):
    """
    Chunk one episode's transcript, process every chunk, and store the results.

    ``df_row`` must provide the "episode number", "Title", "URL" and
    "Transcript" entries.
    """
    transcript = df_row["Transcript"]
    episode_number, title, url = df_row["episode number"], df_row["Title"], df_row["URL"]

    # Placeholder cut points every 500 characters (real timestamps should be
    # extracted from transcript data)
    timestamps = list(range(0, len(transcript), 500))

    pieces = chunk_transcript(transcript, timestamps)

    # Summarize and embed all chunks concurrently
    processed_chunks = await asyncio.gather(
        *(
            process_chunk(piece, index, episode_number, title, url)
            for index, piece in enumerate(pieces)
        )
    )

    # Then issue all inserts together
    await asyncio.gather(*(insert_chunk(item) for item in processed_chunks))

async def process_all_episodes(df: pd.DataFrame):
    """Run process_and_store_episode for every row of ``df``, all gathered at once."""
    episode_jobs = []
    for position in range(len(df)):
        episode_jobs.append(process_and_store_episode(df.iloc[position]))
    await asyncio.gather(*episode_jobs)

# Run the ingestion pipeline
if __name__ == "__main__":
    df_for_chunks = pd.read_csv("your_podcast_data.csv")  # Load your data
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running (plain `python script.py`): start one.
        asyncio.run(process_all_episodes(df_for_chunks))
    else:
        # An event loop is already running, as it is inside a Jupyter kernel,
        # where asyncio.run() raises RuntimeError. Schedule the work on that loop
        # instead and keep a reference; `await ingestion_task` in a later cell to
        # wait for it to finish.
        ingestion_task = running_loop.create_task(process_all_episodes(df_for_chunks))
