# Podcast Transcript Summarizer and Recommendation Engine

This notebook covers:
1. Fetching podcast transcripts.
2. Summarizing transcripts using an LLM.
3. Keyword search within transcripts.
4. Content recommendation engine using embeddings.


## 1. Install Dependencies

Install required libraries.

In [None]:
!pip install requests bs4 transformers sentence-transformers scikit-learn
!pip install datasets transformers
!git clone https://github.com/FelipeGRK/theamericanlifepodcast.git
!pip install datasets transformers ipywidgets


## 2. Import Libraries

Import necessary libraries and modules.

In [None]:
import os
import requests
from getpass import getpass

# === Data Handling & Processing ===
from datasets import Dataset
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

# === Transformers & Hugging Face Utilities ===
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from huggingface_hub import InferenceClient, login

# === Interactive Widgets & Display ===
import ipywidgets as widgets
from IPython.display import display, clear_output



## 3. Authenticate with Hugging Face

Sign in to Hugging Face using an API Key.

In [None]:
# Ask for the token with getpass so it is read without being echoed back.
hf_api_key = getpass("Please enter your Hugging Face API key: ")
# Authenticate with the Hugging Face Hub using the token that was entered.
login(token=hf_api_key)

## 5. Fetch Podcast Transcripts

Fetch transcripts from provided URLs.

In [None]:
def fetch_transcript(url, timeout=30):
    """Download a transcript page and return its text.

    Args:
        url: Address of the transcript page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of the ``<div class="transcript">`` element when the
        page has one, otherwise the stripped text of the whole page. ``None``
        when the request fails or the server does not answer with status 200.
    """
    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failures are reported the same way as a non-200
        # answer, so callers only have to check for ``None``.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    # Prefer the dedicated transcript container; fall back to the full page.
    transcript_div = soup.find("div", class_="transcript")
    text_source = transcript_div if transcript_div else soup
    return text_source.get_text(separator="\n").strip()

# Transcript pages of the ten episodes this notebook works with.
transcript_urls = [
    "https://www.thisamericanlife.org/1/transcript",
    "https://www.thisamericanlife.org/2/transcript",
    "https://www.thisamericanlife.org/3/transcript",
    "https://www.thisamericanlife.org/4/transcript",
    "https://www.thisamericanlife.org/5/transcript",
    "https://www.thisamericanlife.org/6/transcript",
    "https://www.thisamericanlife.org/7/transcript",
    "https://www.thisamericanlife.org/8/transcript",
    "https://www.thisamericanlife.org/9/transcript",
    "https://www.thisamericanlife.org/10/transcript",
]

# Keep only the pages that could be downloaded; a failed URL is reported and
# skipped, so `transcripts` may end up shorter than `transcript_urls`.
transcripts = []
for url in transcript_urls:
    transcript_text = fetch_transcript(url)
    if not transcript_text:
        print(f"Failed to retrieve transcript from {url}")
        continue
    transcripts.append(transcript_text)
    print(f"Transcript fetched from {url}")

# Overall status: success as soon as at least one transcript was fetched.
if not transcripts:
    print("Failed to fetch transcripts.")
else:
    print("Transcripts fetched successfully.")

## 7. Define a Custom Prompt

Create a function to generate custom prompts for summarization.

In [None]:
def generate_prompt(transcript, episode, title, date):
    """Build the summarization prompt for one episode.

    Args:
        transcript: Full transcript text to embed in the prompt.
        episode: Episode number shown in the prompt.
        title: Episode title shown in the prompt.
        date: Publication date shown in the prompt.

    Returns:
        The prompt text. It starts and ends with a newline and embeds the
        metadata as a bullet list followed by the transcript.
    """
    # The empty first and last entries produce the leading and trailing newline.
    prompt_lines = [
        "",
        "You are an assistant specialized in summarizing podcast episodes.",
        "From the following transcript, generate a concise and informative summary that includes:",
        f"- Episode number: {episode}",
        f"- Title: {title}",
        f"- Publication date: {date}",
        "- Main topics discussed in the episode",
        "- Names of speakers and guests mentioned",
        "",
        "Transcript:",
        f"{transcript}",
        "",
        "Respond with a clear and structured summary.",
        "",
    ]
    return "\n".join(prompt_lines)

## 8. Create Prompts for the First 10 Episodes

Construct prompts using the fetched transcripts and metadata.

In [None]:
# Episode number, title and date of the first ten episodes, in episode order.
# NOTE(review): dates look like YY-MM-DD — confirm.
metadata = [
    {"episode": "001", "title": "New Beginnings", "date": "95-11-17"},
    {"episode": "002", "title": "Small Scale Sin", "date": "95-11-24"},
    {"episode": "003", "title": "A Violent Utopia", "date": "95-12-01"},
    {"episode": "004", "title": "Animals", "date": "95-12-08"},
    {"episode": "005", "title": "Anger and Forgiveness", "date": "95-12-15"},
    {"episode": "006", "title": "Poultry Slam 1995", "date": "95-12-22"},
    {"episode": "007", "title": "Quitting", "date": "96-01-05"},
    {"episode": "008", "title": "On Work", "date": "96-01-12"},
    {"episode": "009", "title": "Julia Sweeney", "date": "96-01-19"},
    {"episode": "010", "title": "Double Lives", "date": "96-01-26"},
]

# `transcripts` only holds the pages that were fetched successfully, so a
# failed download shifts every later transcript onto the wrong metadata entry.
# That cannot be repaired from here; make the mismatch visible instead.
if len(transcripts) != len(metadata):
    print(
        f"Warning: {len(transcripts)} transcripts for {len(metadata)} metadata "
        "entries; episode details in the prompts may be misaligned."
    )

prompts = []
# zip() pairs by position and stops at the shorter list, so a surplus
# transcript can no longer raise IndexError.
for transcript, meta in zip(transcripts, metadata):
    prompt_custom = generate_prompt(transcript, meta["episode"], meta["title"], meta["date"])
    prompts.append(prompt_custom)
    print(f"Custom prompt created for episode {meta['episode']}.")

## 6. Select Podcast Transcript
Allow the user to select the podcast transcript they want to see and provide an option to listen to the podcast.


In [None]:
def display_transcript_options(transcripts, metadata):
    """List the episodes and print the transcript the user picks.

    Prints a numbered menu built from ``metadata``, reads a 1-based choice
    from standard input and prints the matching transcript. Input that is not
    a number, or a number outside the selectable range, prints
    ``Invalid selection.`` instead of raising.

    Args:
        transcripts: Transcript texts, indexed in the same order as ``metadata``.
        metadata: Dicts that provide at least the ``'title'`` and ``'date'`` keys.
    """
    for i, meta in enumerate(metadata):
        print(f"{i+1}. Title: {meta['title']} - Publication date: {meta['date']}")

    # The choice indexes both lists, so the shorter one sets the upper bound.
    selectable = min(len(transcripts), len(metadata))
    raw_choice = input(f"Select the transcript you want to see (1-{selectable}): ")
    try:
        selection = int(raw_choice) - 1
    except ValueError:
        # Non-numeric input is treated like an out-of-range number.
        print("Invalid selection.")
        return

    if 0 <= selection < selectable:
        print(f"Showing transcript for episode: {metadata[selection]['title']}")
        print(transcripts[selection])
    else:
        print("Invalid selection.")

# Show the menu for the fetched transcripts and wait for the user's choice.
display_transcript_options(transcripts, metadata)

## 9. Summarize the Transcripts
Load the transcripts and generate a summary for each one.


In [None]:
from datasets import Dataset
from transformers import pipeline

# Build the summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Directory that holds the transcript .txt files read further down.
# NOTE(review): presumably the checkout created by the `git clone` step when
# the notebook runs from /content (e.g. Colab) — confirm before running elsewhere.
folder_path = "/content/theamericanlifepodcast/transcript-text"

# Function to split text chunks for summarization
def chunk_text(text, max_chunk_chars=1024):
    """Split ``text`` into chunks of whole words with a bounded size.

    Words are taken from ``text.split()`` and re-joined with single spaces.
    Each word is budgeted as its length plus one separator character, and a
    chunk is closed before a word that would push the budget past
    ``max_chunk_chars``.

    Args:
        text: Text to split.
        max_chunk_chars: Character budget per chunk.

    Returns:
        A list of non-empty chunk strings; empty when ``text`` has no words.
        A single word longer than the budget is kept whole in its own chunk.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        word_cost = len(word) + 1
        # Only close a chunk that has content: an oversized first word must
        # not produce an empty chunk ahead of itself.
        if current_chunk and current_length + word_cost > max_chunk_chars:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_cost
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize a transcript from my dataset
def summarize_transcript(batch):
    """Attach a ``"summary"`` field to one dataset record.

    The transcript is split with ``chunk_text``, each chunk is summarized on
    its own and the partial summaries are joined with single spaces.

    Args:
        batch: Mapping with a ``"transcript"`` string. Despite the name it is
            handled as a single record.

    Returns:
        The same mapping with ``"summary"`` set.
    """
    partial_summaries = []
    for chunk in chunk_text(batch["transcript"], max_chunk_chars=1024):
        if not chunk:
            # There is nothing to summarize in an empty chunk.
            continue

        # Length limits scale with the chunk size.
        # NOTE(review): the size is counted in whitespace-separated words
        # while the pipeline limits are presumably token counts — confirm
        # the approximation is intended.
        input_length = len(chunk.split())
        # int(input_length * 0.8) is 0 for a one-word chunk; keep the upper
        # limit strictly positive.
        max_len = max(1, min(200, int(input_length * 0.8)))
        min_len = min(50, int(input_length * 0.4))

        generated = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
        partial_summaries.append(generated[0]['summary_text'])

    batch["summary"] = " ".join(partial_summaries).strip()
    return batch

# Collect the .txt transcripts in name order into two parallel columns.
files = sorted(name for name in os.listdir(folder_path) if name.endswith(".txt"))
data = {"filename": [], "transcript": []}

for file_name in files:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        data["transcript"].append(file.read())
    data["filename"].append(file_name)

# Wrap the columns in a Dataset and show its structure before summarizing.
dataset = Dataset.from_dict(data)
print("Dataset loaded. Preview:")
print(dataset)

# Run the summarizer over every record; each one gains a "summary" field.
dataset = dataset.map(summarize_transcript)

# Print each file name with its summary, followed by a ruled separator.
for record in dataset:
    print("Filename:", record["filename"])
    print("Generated Summary:\n", record["summary"])
    print("\n" + "="*80 + "\n")


## Dropdown List to Select a Podcast for Summarization
Load the podcast transcript summary based on the user's choice.

In [None]:

# Cell: Interactive Dropdown to Display Summaries
import ipywidgets as widgets
from IPython.display import display, clear_output

# Prepare dropdown options as (label, value) pairs: the filename is the label,
# the record's position in `dataset` is the value.
options = [(rec["filename"], i) for i, rec in enumerate(dataset)]
dropdown = widgets.Dropdown(
    options=options,
    description='Podcast:',
    value=options[0][1]  # Default to the first transcript; IndexError if `dataset` is empty
)

# Output area that on_selection_change prints the chosen summary into.
output = widgets.Output()

def on_selection_change(change):
    """Show the stored summary for the record picked in the dropdown.

    ``change`` is the dict handed to the observer. Only a ``'change'`` event
    on the ``'value'`` trait is handled; anything else is ignored.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    record_index = change['new']

    # Replace whatever was shown before with a short "loading" notice.
    output.clear_output()
    with output:
        print("Loading summarized version, please wait...\n")

    # Retrieve the record, then swap the notice for its summary.
    chosen_record = dataset[record_index]
    output.clear_output()
    with output:
        print("Filename:", chosen_record["filename"])
        print("Generated Summary:\n", chosen_record["summary"])
        print("-" * 60)

# Register on_selection_change for changes of the dropdown's 'value' trait,
# then render the dropdown together with the output area.
dropdown.observe(on_selection_change, names='value')
display(dropdown, output)

## 10. Keyword Search Engine

Implement a keyword search engine.

In [None]:
def keyword_search(transcript, keyword):
    """Return the lines of ``transcript`` that contain ``keyword``.

    The transcript is split on newlines and the match ignores case. Matching
    lines are returned unchanged, in their original order.
    """
    candidate_lines = transcript.split('\n')
    needle = keyword.lower()
    return [text for text in candidate_lines if needle in text.lower()]

# Demo: search the second fetched transcript for a fixed keyword.
if 'transcripts' not in globals():
    print("Transcripts are not defined. Please run the fetching cell .")
elif len(transcripts) < 2:
    # transcripts[1] below needs at least two successfully fetched transcripts.
    print("Fewer than two transcripts are available; nothing to search.")
else:
    keyword = "betray"
    search_results = keyword_search(transcripts[1], keyword)

    print(f"Search results for '{keyword}':\n")
    for result in search_results:
        print(result)

## 11. Semantic Search with Embeddings

Implement a semantic search engine using embeddings.

In [None]:
# Sentence-embedding model; semantic_search also uses it to embed the query.
# NOTE(review): this model presumably has a limited input length, so long
# transcripts may be truncated before embedding — confirm.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Embed every fetched transcript; rows presumably follow the order of `transcripts`.
transcript_embeddings = model.encode(transcripts)

def semantic_search(query, embeddings, top_k=5):
    """Rank ``embeddings`` by cosine similarity to ``query``.

    Args:
        query: Text to search for; it is embedded with the module-level ``model``.
        embeddings: Embeddings to compare against, one row per candidate.
        top_k: Maximum number of indices to return. Zero or a negative value
            returns no indices.

    Returns:
        A tuple ``(similar_indices, similarities)``: the indices of the best
        matches, most similar first, and the similarity of every row in
        ``embeddings`` (indexable by those indices).
    """
    query_embedding = model.encode([query])[0]
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    # Sort descending first, then cut: slicing with [-top_k:] would return
    # every index for top_k == 0.
    ranked_indices = similarities.argsort()[::-1]
    return ranked_indices[:max(top_k, 0)], similarities

# Demo: rank the fetched transcripts against a fixed query.
query = "economic impact"
similar_indices, similarities = semantic_search(query, transcript_embeddings)

# Print the query that was actually searched rather than a hard-coded copy.
print(f'Semantic search results for query "{query}":\n')
# NOTE(review): metadata is looked up by embedding position, which assumes
# every transcript was fetched so positions line up with `metadata` — confirm.
for idx in similar_indices:
    print(f"Episode {metadata[idx]['episode']}, Similarity: {similarities[idx]:.4f}")

## 12. Content Recommendation Engine

Implement a content recommendation engine using embeddings.

In [None]:
def recommend_episodes(transcript_embedding, all_embeddings, top_k=5, exclude_index=None):
    """Rank all episodes by cosine similarity to one episode's embedding.

    Args:
        transcript_embedding: Embedding of the episode to find neighbours for.
        all_embeddings: Embeddings of every candidate episode, one row each.
        top_k: Maximum number of indices to return. Zero or a negative value
            returns no indices.
        exclude_index: Optional row index to leave out of the ranking,
            typically the position of ``transcript_embedding`` itself so an
            episode is not recommended to itself. ``None`` keeps every row.

    Returns:
        A tuple ``(similar_indices, similarities)``: the indices of the best
        matches, most similar first, and the similarity of every row in
        ``all_embeddings`` (indexable by those indices).
    """
    similarities = cosine_similarity([transcript_embedding], all_embeddings)[0]
    # Sort descending first, then cut: slicing with [-top_k:] would return
    # every index for top_k == 0.
    ranked_indices = similarities.argsort()[::-1]
    if exclude_index is not None:
        ranked_indices = ranked_indices[ranked_indices != exclude_index]
    return ranked_indices[:max(top_k, 0)], similarities

# Demo: recommend episodes similar to the second fetched transcript.
example_index = 1
example_transcript_embedding = transcript_embeddings[example_index]
# The example episode is its own closest match, so ask for one extra
# candidate and drop the example from the list that is shown.
similar_indices, similarities = recommend_episodes(
    example_transcript_embedding, transcript_embeddings, top_k=6
)
similar_indices = [idx for idx in similar_indices if idx != example_index][:5]

print("Recommended episodes based on similarity:\n")
# NOTE(review): metadata is looked up by embedding position, which assumes
# every transcript was fetched so positions line up with `metadata` — confirm.
for idx in similar_indices:
    print(f"Episode {metadata[idx]['episode']}, Similarity: {similarities[idx]:.4f}")