In [None]:
# Extract youtube video link from youtube channel
import yt_dlp
import csv

def save_channel_videos_to_file(channel_url, filename="dilmah_videos.csv"):
    # Configure options for yt-dlp
    ydl_opts = {
        'quiet': True,
        'extract_flat': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            print(f"Fetching videos from {channel_url}...")
            result = ydl.extract_info(channel_url, download=False)

            if 'entries' in result:
                # Open a new CSV file to write the data
                with open(filename, mode='w', newline='', encoding='utf-8') as file:
                    writer = csv.writer(file)
                    # Write the header row
                    writer.writerow(["Video Title", "Video URL"])
                    
                    count = 0
                    for entry in result['entries']:
                        video_url = f"https://www.youtube.com/watch?v={entry['id']}"
                        video_title = entry.get('title', 'No Title')
                        
                        # Write the data row
                        writer.writerow([video_title, video_url])
                        count += 1
                
                print(f"Success! {count} video links saved to '{filename}'.")
            else:
                print("No videos found.")

        except Exception as e:
            print(f"An error occurred: {e}")

# Target Channel and run
target_url = "https://www.youtube.com/@DilmahRealTea/videos"
# save_channel_videos_to_file(target_url)

Fetching videos from https://www.youtube.com/@DilmahRealTea/videos...
Success! 1685 video links saved to 'dilmah_videos.csv'.


In [13]:
# Dowload a audio from a youtube video link
import yt_dlp

def download_youtube_audio(url):
    ydl_opts = {
        'format': 'bestaudio/best',  # Selects the best audio quality
        'postprocessors': [{
            'key': 'FFmpegExtractAudio', # Uses FFmpeg to extract audio
            'preferredcodec': 'mp3',      # Convert to mp3
            'preferredquality': '192',    # Audio bitrate
        }],
        'outtmpl': '%(title)s.%(ext)s',  # Save file as "Title.mp3"
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print("Extracting audio...")
            ydl.download([url])
            print("\nAudio download complete!")
    except Exception as e:
        print(f"An error occurred: {e}")

# Usage
video_url = "https://www.youtube.com/watch?v=cnuSE24ECuM"
download_youtube_audio(video_url)

Extracting audio...
[youtube] Extracting URL: https://www.youtube.com/watch?v=cnuSE24ECuM
[youtube] cnuSE24ECuM: Downloading webpage




[youtube] cnuSE24ECuM: Downloading android sdkless player API JSON
[youtube] cnuSE24ECuM: Downloading web safari player API JSON




[youtube] cnuSE24ECuM: Downloading m3u8 information




[info] cnuSE24ECuM: Downloading 1 format(s): 251-12
[download] Destination: Blessed Christmas Wishes from the Dilmah Family 2025.webm
[download] 100% of    2.09MiB in 00:00:00 at 2.11MiB/s   


ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location


An error occurred: ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location


In [15]:
# get transcript from youtube audio
import os
from dotenv import load_dotenv
load_dotenv()
from groq import Groq
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

client = Groq(api_key=GROQ_API_KEY)
filename = "Blessed Christmas Wishes from the Dilmah Family 2025.webm"

with open(filename, "rb") as file:
    transcription = client.audio.transcriptions.create(
      file=(filename, file.read()),
      model="whisper-large-v3-turbo",
      temperature=0,
      response_format="verbose_json",
    )
    print(transcription.text)

 Hello friends, as we celebrate Christmas, things that are happening in the world around us demand reflection on what truly matters. Values, purpose and how we can positively impact people and nature. Our grandfather wanted to make the world a better tea and by better he meant kinder, better for our health, better for nature and better tasting. He meant it sincerely, which is why Dilma has a foundation of quality and integrity with a heart of kindness. My father's dream became reality 40 years ago. It has now become the global mission for the family to touch the lives of hundreds of thousands of less fortunate people every year through our work in tea, tourism and cinnamon. Business must serve humanity. Life without purpose is meaningless. So for us this blessed season is as much about hope as it is about giving. We cannot live without hope. Yet conflict chaos and inequality around the world are pushing many towards hopelessness The coolest thing about all this is that the happiness th

In [5]:
# filter the csv file to get only the successful transcriptions
import pandas as pd

# Input and output file paths
input_file = "dilmah_videos_transcription.csv"
output_file = "dilmah_videos_transcription_filtered.csv"

# Read the CSV
df = pd.read_csv(input_file)

# Filter rows where Status is 'Success'
df_success = df[df["Status"] == "Success"]

# Save to a new CSV (without index column)
df_success.to_csv(output_file, index=False)

print(f"Filtered file saved as: {output_file}")

Filtered file saved as: dilmah_videos_transcription_filtered.csv


In [None]:
# summarize the transcript for RAG indexing
import os
import pandas as pd
from dotenv import load_dotenv
from groq import Groq
import time

# Load environment variables
load_dotenv()
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

INPUT_CSV = "dilmah_videos_transcription_part_6.csv"
OUTPUT_CSV = "data_rag_ready.csv"

RAG_PROMPT = """
You are preparing content for a Retrieval-Augmented Generation (RAG) system.

Task:
Create a dense, factual summary of the transcript optimized for semantic retrieval.

Rules:
- Preserve key entities (people, brands, organizations, locations)
- Preserve facts, dates, numbers, milestones
- Remove filler, greetings, repetition, and emotional language
- Do NOT add new information
- Do NOT use marketing language
- Use neutral, information-dense sentences
- 3–5 sentences maximum

Transcript:
{transcript}
"""

def summarize_for_rag(transcript):
    if not isinstance(transcript, str) or transcript.strip() == "":
        return ""

    response = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {"role": "system", "content": "You summarize transcripts for RAG indexing."},
            {"role": "user", "content": RAG_PROMPT.format(transcript=transcript)}
        ],
        temperature=0.2,
        max_completion_tokens=400,
        top_p=1,
        reasoning_effort="medium"
    )

    return response.choices[0].message.content.strip()

def main():
    df = pd.read_csv(INPUT_CSV)

    rag_summaries = []

    for idx, row in df.iterrows():
        print(f"Processing {idx + 1}/{len(df)}")

        if row.get("Status") != "Success":
            rag_summaries.append("")
            continue

        summary = summarize_for_rag(row["Transcript"])
        rag_summaries.append(summary)

        # Gentle rate limiting
        time.sleep(1)

    df["Summary"] = rag_summaries
    df.to_csv(OUTPUT_CSV, index=False)

    print(f"\n✅ RAG-ready CSV saved as: {OUTPUT_CSV}")

# if __name__ == "__main__":
#     main()