CHAT BOT

In [None]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings  # free, no API key

# ===============================
# Groq API setup
# ===============================
# The Groq API key is read from the environment so that no credential lives in
# the source file.
# NOTE(review): a real key used to be hard-coded here; treat it as leaked and
# revoke it in the Groq console.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "export it before running the chat bot."
    )

# temperature=0 is used for the question-answering chain.
llm = ChatGroq(
    temperature=0,
    groq_api_key=GROQ_API_KEY,
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"
)

# ===============================
# Helper Functions
# ===============================
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF file paths. Leading/trailing whitespace in
            each path is stripped before the file is opened.

    Returns:
        One string holding the text of all pages, in document order and then
        page order. Pages that yield no text contribute nothing.
    """
    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf.strip())
        for page in pdf_reader.pages:
            # Guard against a page yielding None/empty text (as the other
            # get_pdf_text variants in this file do) so concatenation cannot
            # raise TypeError.
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)


def get_text_chunks(text):
    """Split `text` into pieces with a RecursiveCharacterTextSplitter.

    The splitter is configured with chunk_size=1000 and chunk_overlap=20, and
    the result of its split_text() call is returned unchanged.
    """
    splitter_options = {"chunk_size": 1000, "chunk_overlap": 20}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


def get_vector_store(text_chunks):
    """Build and return a FAISS vector store over `text_chunks`.

    The chunks are embedded with HuggingFaceEmbeddings using the
    "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    # HuggingFace embeddings are used here (no API key is passed).
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(text_chunks, embedding=embedder)


def get_conversational_chain(vector_store):
    """Create a ConversationalRetrievalChain backed by `vector_store`.

    The chain uses the module-level `llm`, the store's default retriever, and
    a ConversationBufferMemory that keeps the dialogue under the
    "chat_history" key as message objects.
    """
    history = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True
    )
    retriever = vector_store.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=llm, retriever=retriever, memory=history
    )


# ===============================
# CLI Chat Functions
# ===============================
def user_input(conversation, user_question):
    """Send one question to the chain and print the reply to it.

    Only the new reply is printed. Earlier versions re-printed the whole
    chat history on every turn and guessed the speaker from the message
    index, which floods a terminal session with repeated lines.

    Args:
        conversation: Callable chain; called with ``{'question': ...}`` and
            expected to return a mapping.
        user_question: The question text typed by the user.
    """
    response = conversation({'question': user_question})

    answer = response.get('answer')
    if answer is None:
        # Fallback for chains that do not expose an 'answer' key: use the
        # content of the most recent message in the returned history.
        history = response.get('chat_history') or []
        answer = history[-1].content if history else ""

    print("Reply:", answer)


def main():
    """Run the command-line PDF chat session.

    Prompts for comma-separated PDF paths, indexes their text, then answers
    questions in a loop until the user types 'exit' or 'quit'.
    """
    # Ask user for PDF path(s); drop blank entries produced by empty input or
    # stray commas so they are never handed to the PDF reader.
    raw_paths = input("Enter PDF file paths (comma separated): ").split(",")
    pdf_paths = [path.strip() for path in raw_paths if path.strip()]
    if not pdf_paths:
        print("No PDF paths were provided.")
        return

    # Load and process PDF(s)
    raw_text = get_pdf_text(pdf_paths)
    text_chunks = get_text_chunks(raw_text)
    if not text_chunks:
        # Nothing to index (e.g. scanned PDFs without a text layer); stop
        # here instead of trying to build a vector store from zero chunks.
        print("No text could be extracted from the given PDF(s).")
        return
    vector_store = get_vector_store(text_chunks)

    # Create conversation chain
    conversation = get_conversational_chain(vector_store)

    print("‚úÖ System is ready! Ask questions (type 'exit' to quit).\n")

    # Chat loop
    while True:
        user_question = input("You: ").strip()
        if not user_question:
            # Ignore blank lines rather than sending an empty question.
            continue
        if user_question.lower() in ("exit", "quit"):
            print("üëã Exiting...")
            break
        user_input(conversation, user_question)


if __name__ == "__main__":
    main()

FLASHCARD

In [None]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings  # free, no API key
from langchain.vectorstores import FAISS
import os
import random

# ===============================
# Groq API setup
# ===============================
# The Groq API key is read from the environment so that no credential lives in
# the source file.
# NOTE(review): a real key used to be hard-coded here; treat it as leaked and
# revoke it in the Groq console.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "export it before generating flashcards."
    )

# Model used for flashcard generation.
llm = ChatGroq(
    temperature=0.3,
    groq_api_key=GROQ_API_KEY,
    model_name="meta-llama/llama-4-maverick-17b-128e-instruct"
)

# ===============================
# Helper Functions
# ===============================
def get_pdf_text(pdf_docs):
    """Return the text of all pages of all given PDFs as one string.

    Each entry of `pdf_docs` is a path; it is stripped of surrounding
    whitespace before being opened. Pages whose extraction yields a falsy
    value contribute an empty string.
    """
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf.strip()).pages
    )

def get_text_chunks(text, chunk_size=1000, overlap=100):
    """Split `text` with a RecursiveCharacterTextSplitter.

    `chunk_size` and `overlap` are forwarded as the splitter's `chunk_size`
    and `chunk_overlap`; the list produced by split_text() is returned.
    """
    splitter_config = dict(chunk_size=chunk_size, chunk_overlap=overlap)
    return RecursiveCharacterTextSplitter(**splitter_config).split_text(text)

def generate_flashcards_from_chunk(chunk, num_cards=2):
    """Ask the module-level `llm` for flashcards about `chunk`.

    Args:
        chunk: Source text the flashcards should be based on.
        num_cards: Number of question/answer pairs requested in the prompt.

    Returns:
        A list of dicts that each contain "question" and "answer" keys.
        Returns an empty list when the model call fails or its reply cannot
        be parsed; the error is printed, not raised.
    """
    # Local import: this section of the file does not import json itself.
    import json

    prompt = f"""
    Create {num_cards} flashcards (question-answer pairs) from the following text.
    Format strictly as JSON list like:
    [
      {{"question": "Q1?", "answer": "A1"}},
      {{"question": "Q2?", "answer": "A2"}}
    ]

    Text:
    {chunk}
    """
    try:
        response = llm.invoke(prompt)
        raw = response.content.strip()

        # The reply is model-generated text, so it is parsed as JSON data and
        # never executed (the previous eval() would run arbitrary code).
        # Keep only the outermost [...] so code fences or chatter around the
        # list do not break parsing.
        start = raw.find("[")
        end = raw.rfind("]")
        if start == -1 or end < start:
            raise ValueError("no JSON list found in model response")
        cards = json.loads(raw[start:end + 1])

        # Drop entries that lack the keys callers index into.
        return [
            card for card in cards
            if isinstance(card, dict) and "question" in card and "answer" in card
        ]
    except Exception as e:
        print(f"‚ö†Ô∏è Error generating flashcards from chunk: {e}")
        return []

def generate_flashcards(text, final_count=5):
    """Produce at most `final_count` flashcards from `text`.

    The text is chunked, two cards are requested for each of the first five
    chunks, and if more than `final_count` cards come back a random sample of
    exactly `final_count` is returned.
    """
    # Only the first five chunks are used, to limit the number of LLM calls.
    selected_chunks = get_text_chunks(text)[:5]

    pool = []
    for piece in selected_chunks:
        pool.extend(generate_flashcards_from_chunk(piece, num_cards=2))

    if len(pool) <= final_count:
        return pool
    return random.sample(pool, final_count)

# ===============================
# Main
# ===============================
def main():
    """Generate five flashcards from the configured PDF and print them.

    Stops with a message when no text can be extracted or when no usable
    flashcards come back, instead of announcing success with an empty list
    or failing on a malformed card.
    """
    pdf_paths = ["/content/NOTES UNIT-4 ANN.pdf"]

    raw_text = get_pdf_text(pdf_paths)
    if not raw_text.strip():
        print("No text could be extracted from the PDF(s); nothing to generate.")
        return

    print("‚è≥ Generating 5 flashcards...")
    flashcards = generate_flashcards(raw_text, final_count=5)

    # The cards come from model output; keep only entries that really have
    # the two keys printed below.
    flashcards = [
        card for card in flashcards
        if isinstance(card, dict) and "question" in card and "answer" in card
    ]
    if not flashcards:
        print("No flashcards could be generated.")
        return

    print("\n‚úÖ Flashcards Generated!\n")
    for i, card in enumerate(flashcards, 1):
        print(f"{i}. Q: {card['question']}")
        print(f"   A: {card['answer']}\n")

if __name__ == "__main__":
    main()

QUIZ

In [None]:
import os
import json
import random
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq

# ===============================
# Groq API setup
# ===============================
# The Groq API key is read from the environment so that no credential lives in
# the source file.
# NOTE(review): a real key used to be hard-coded here; treat it as leaked and
# revoke it in the Groq console.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "export it before generating a quiz."
    )

# Model used for quiz generation.
llm = ChatGroq(
    temperature=0.3,
    groq_api_key=GROQ_API_KEY,
    model_name="meta-llama/llama-4-maverick-17b-128e-instruct"
)

# ===============================
# Helper Functions
# ===============================
def get_pdf_text(pdf_paths):
    """Collect the page text of every readable PDF in `pdf_paths`.

    Missing files and read errors are reported with a printed warning and
    skipped; text gathered before an error is kept. Returns the collected
    text as one string.
    """
    collected = []
    for pdf_path in pdf_paths:
        try:
            if not os.path.exists(pdf_path):
                print(f"‚ö†Ô∏è PDF file not found: {pdf_path}")
                continue
            for page in PdfReader(pdf_path).pages:
                collected.append(page.extract_text() or "")
        except Exception as e:
            print(f"‚ö†Ô∏è Error reading PDF {pdf_path}: {e}")
    return "".join(collected)

def get_text_chunks(text, chunk_size=1000, overlap=100):
    """Return `text` split into chunks by a RecursiveCharacterTextSplitter.

    `chunk_size` is passed through as-is and `overlap` is passed as the
    splitter's `chunk_overlap`.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=overlap
    ).split_text(text)

def clean_response(raw):
    """Parse an LLM reply into a list of JSON values.

    Markdown code fences are removed, then the whole reply is parsed as JSON.
    If that fails, the outermost ``[...]`` span is parsed instead, which
    handles replies with text around the array.

    Args:
        raw: The reply text. Anything that is not a string is rejected.

    Returns:
        A list. A single JSON object is wrapped in a one-element list, and
        any other non-list JSON value gives ``[]``. On a parse failure a
        warning is printed and ``[]`` is returned.
    """
    if not isinstance(raw, str):
        # Slicing a non-string below would raise inside the error handler.
        print(f"‚ö†Ô∏è Failed to parse response: expected text, got {type(raw).__name__}")
        return []

    # Remove backticks and code block markers
    text = raw.strip()
    if text.startswith("```"):
        text = text.replace("```json", "").replace("```", "").strip()

    try:
        try:
            parsed = json.loads(text)
        except ValueError:
            # Not pure JSON: try the outermost array embedded in the reply.
            start_idx = text.find('[')
            end_idx = text.rfind(']')
            if start_idx == -1 or end_idx < start_idx:
                raise
            parsed = json.loads(text[start_idx:end_idx + 1])
    except ValueError as e:
        print(f"‚ö†Ô∏è Failed to parse response: {e}")
        print(f"Raw response: {text[:200]}...")
        return []

    if isinstance(parsed, dict):
        # A lone object (common when one item was requested) becomes a list.
        return [parsed]
    return parsed if isinstance(parsed, list) else []

def _is_valid_question(q):
    """Return True when `q` has the shape the quiz loop relies on."""
    if not isinstance(q, dict):
        return False
    if 'question' not in q or 'options' not in q or 'answer' not in q:
        return False
    options = q['options']
    # The option keys must be exactly A-D: the quiz loop looks up
    # options[answer], so four options under other keys would raise KeyError.
    if not isinstance(options, dict) or set(options) != {'A', 'B', 'C', 'D'}:
        return False
    return q['answer'] in ('A', 'B', 'C', 'D')


def generate_quiz_from_chunk(chunk, num_questions=2):
    """Generate multiple-choice questions from a text chunk.

    Args:
        chunk: Source text. Chunks shorter than 100 characters (after
            stripping) are skipped; only the first 800 characters are sent
            to the model.
        num_questions: Number of questions requested in the prompt.

    Returns:
        A list of question dicts with "question", "options" (keys A-D) and
        "answer" (one of A-D). Malformed entries are dropped. Returns ``[]``
        when the chunk is too short or the model call fails; the error is
        printed, not raised.
    """
    if len(chunk.strip()) < 100:  # Skip very short chunks
        return []

    prompt = f"""
Create {num_questions} multiple-choice quiz questions from the following text.
Each question must have exactly 4 options (A, B, C, D) and clearly indicate the correct answer.

IMPORTANT: Format your response as a valid JSON array only, like this example:
[
    {{
        "question": "What is the main concept discussed?",
        "options": {{
            "A": "Option 1",
            "B": "Option 2",
            "C": "Option 3",
            "D": "Option 4"
        }},
        "answer": "B"
    }}
]

Text to create questions from:
{chunk[:800]}
"""

    try:
        response = llm.invoke(prompt)
        questions = clean_response(response.content)
        return [q for q in questions if _is_valid_question(q)]
    except Exception as e:
        print(f"‚ö†Ô∏è Error generating quiz from chunk: {e}")
        return []

def generate_quiz(text, final_count=5):
    """Build a quiz of at most `final_count` questions from `text`.

    One question is requested per chunk, for at most the first ten chunks,
    stopping early once enough questions have been collected. Progress is
    printed along the way. Returns ``[]`` for blank text.
    """
    if text.strip() == "":
        print("‚ö†Ô∏è No text extracted from PDFs")
        return []

    print(f"üìÑ Extracted {len(text)} characters from PDF(s)")

    chunks = get_text_chunks(text)
    print(f"üìö Created {len(chunks)} text chunks")

    collected = []
    productive_chunks = 0  # chunks that yielded at least one question

    # Look at no more than the first 10 chunks
    for piece in chunks[:10]:
        if len(collected) >= final_count:
            break

        new_questions = generate_quiz_from_chunk(piece, num_questions=1)
        if not new_questions:
            continue
        collected.extend(new_questions)
        productive_chunks += 1
        print(f"‚úÖ Generated {len(new_questions)} question(s) from chunk {productive_chunks}")

    # Trim to the requested size with a random selection
    if len(collected) <= final_count:
        return collected
    return random.sample(collected, final_count)

# ===============================
# Main Interactive Quiz
# ===============================
def main():
    """Run the interactive quiz: read the PDF, generate questions, ask them.

    Each question is printed with its options, the answer is read from the
    terminal until it is one of A/B/C/D, and a final score with a feedback
    line is shown at the end.
    """
    banner = "=" * 50

    print("üéØ PDF Quiz Generator")
    print(banner)

    # You can modify these paths or make them user input
    pdf_paths = ["/content/NOTES UNIT-4 ANN.pdf"]

    print("üìñ Reading PDF files...")
    raw_text = get_pdf_text(pdf_paths)

    if raw_text.strip() == "":
        print("‚ùå No text could be extracted from the PDFs. Please check the file paths.")
        return

    print("‚è≥ Generating Quiz Questions...")
    quiz = generate_quiz(raw_text, final_count=5)

    if not quiz:
        print("‚ùå No quiz questions could be generated. Please check your PDF content and API connection.")
        return

    print(f"\n‚úÖ Quiz Ready! Generated {len(quiz)} questions üöÄ")
    print(banner)

    total_questions = len(quiz)
    score = 0
    valid_answers = ('A', 'B', 'C', 'D')

    for number, item in enumerate(quiz, 1):
        print(f"\nQuestion {number}/{total_questions}:")
        print(f"{item['question']}")
        print()

        # Show the answer options
        for letter, option_text in item['options'].items():
            print(f"  {letter}) {option_text}")

        print()

        # Keep asking until the reply is one of the four letters
        user_ans = input("Your answer (A/B/C/D): ").strip().upper()
        while user_ans not in valid_answers:
            print("‚ö†Ô∏è Please enter A, B, C, or D")
            user_ans = input("Your answer (A/B/C/D): ").strip().upper()

        # Grade the reply
        if user_ans != item['answer']:
            correct_option = item['options'][item['answer']]
            print(f"‚ùå Wrong! Correct Answer: {item['answer']}) {correct_option}")
        else:
            print("‚úÖ Correct!")
            score += 1

        print("-" * 30)

    # Final score
    print("\n" + banner)
    print("üéØ Quiz Completed!")
    print(f"Your Final Score: {score}/{total_questions}")

    percentage = (score / total_questions) * 100
    feedback_levels = (
        (80, "üèÜ Excellent work!"),
        (60, "üëç Good job!"),
        (40, "üìö Keep studying!"),
    )
    for threshold, message in feedback_levels:
        if percentage >= threshold:
            print(message)
            break
    else:
        print("üí™ Don't give up, practice makes perfect!")

    print(banner)

if __name__ == "__main__":
    main()

PLANNED TIMETABLE

PODCAST

In [None]:
import os
import re
import tempfile
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings
from gtts import gTTS
import pyttsx3
from pydub import AudioSegment

# ===============================
# 1. Load PDF and Extract Text
# ===============================
def get_pdf_text(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`.

    Any error while opening or reading the file is printed and results in an
    empty string.
    """
    try:
        pages = PdfReader(pdf_path).pages
        return "".join(page.extract_text() or "" for page in pages)
    except Exception as e:
        print(f"‚ùå Error reading PDF: {e}")
        return ""

# ===============================
# 2. Split Text into Chunks
# ===============================
def get_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split `text` using a RecursiveCharacterTextSplitter.

    `chunk_size` and `chunk_overlap` are forwarded to the splitter; the
    result of split_text() is returned.
    """
    settings = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**settings).split_text(text)

# ===============================
# 3. Enhanced Podcast Generator
# ===============================
def generate_podcast(text_chunk, llm):
    """Ask `llm` to turn `text_chunk` into a Host/Guest podcast script.

    The prompt is sent through ``llm.invoke`` and the ``content`` attribute
    of the reply is returned.
    """
    script_request = f"""
    You are a podcast script writer.
    Convert the following study material into a podcast-style script
    between two people: Host and Guest.

    - The Host asks friendly, curious questions.
    - The Guest explains clearly with examples, analogies, and stories.
    - Keep it conversational, engaging, and easy to follow.
    - Do not just read the text, make it sound like real dialogue.
    - Format it clearly with Host: and Guest: labels for each speaker.

    Text:
    {text_chunk}
    """
    reply = llm.invoke(script_request)
    return reply.content

def generate_enhanced_podcast(text_chunk, llm, chunk_number=1, total_chunks=1):
    """Ask `llm` for a Sarah / Dr. Alex podcast segment about `text_chunk`.

    `chunk_number` and `total_chunks` are inserted into the prompt so the
    model can add a recap/preview line. The prompt is sent through
    ``llm.invoke`` and the reply's ``content`` is returned.
    """
    segment_request = f"""
    You are creating an educational podcast script. Convert this study material into
    natural dialogue between Sarah (Host) and Dr. Alex (Subject Expert).

    GUIDELINES:
    - Sarah asks thoughtful questions and provides transitions
    - Dr. Alex explains concepts clearly with real-world examples
    - Include natural conversation elements: "That's interesting...", "So what you're saying is..."
    - Break down complex topics into digestible parts
    - Add brief recap/preview if this is part {chunk_number} of {total_chunks}
    - Keep sentences conversational (not too long or academic)
    - Include pauses like "Um, let me think about that..." for naturalness

    STRUCTURE:
    - Start with Sarah introducing the topic
    - Use follow-up questions: "Can you elaborate on that?"
    - End with Sarah summarizing key points
    - Format as **Host:** and **Guest:** for clear speaker identification

    Study Material:
    {text_chunk}

    Create an engaging 3-4 minute podcast segment.
    """

    reply = llm.invoke(segment_request)
    return reply.content

def add_podcast_intro_outro(script, topic_title="Study Session"):
    """Wrap `script` in the fixed StudyCast opening and closing lines.

    `topic_title` is inserted into the opening; the result is the opening,
    then `script`, then the closing.
    """
    opening_lines = (
        "**Host:** Welcome to StudyCast, where we turn your textbooks into conversations!",
        f"I'm Sarah, and today we're diving into {topic_title}.",
        "With me is Dr. Alex, who's going to help us break this down. Ready to learn? Let's get started!",
        "",
        "",
    )
    closing_lines = (
        "",
        "",
        "**Host:** That was really insightful, Dr. Alex! Thanks for breaking that down for us.",
        "**Guest:** My pleasure, Sarah. I hope this helps with your studies!",
        "**Host:** And thank you for listening to StudyCast. Keep learning, and we'll catch you in the next episode!",
        "",
    )

    return "".join(["\n".join(opening_lines), script, "\n".join(closing_lines)])

# ===============================
# 4. Clean Script for TTS
# ===============================
def clean_script_for_tts(script):
    """Strip speaker labels and Markdown marks so the text can be spoken.

    Applied in order: unwrap ``**bold**`` spans, drop line-leading
    ``Host:`` / ``Guest:`` / ``Speaker N:`` labels, delete remaining
    ``* _ ` #`` characters, collapse blank-line runs to a single blank line,
    and trim the ends.
    """
    substitutions = (
        # Unwrap bold markers
        (r'\*\*([^*]+)\*\*', r'\1', 0),
        # Drop speaker labels at the start of a line
        (r'^(Host|Guest|Speaker \d+):\s*', '', re.MULTILINE),
        # Delete leftover Markdown characters
        (r'[*_`#]', '', 0),
        # Collapse runs of blank lines
        (r'\n\s*\n', '\n\n', 0),
    )

    result = script
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)

    return result.strip()

# ===============================
# 5. Text-to-Speech Methods
# ===============================
def text_to_speech_with_voices(text, output_file="podcast_episode.wav"):
    """Render `text` to `output_file` with pyttsx3 using a single voice.

    The script is cleaned with clean_script_for_tts first. A male and a
    female voice are picked by keyword from the installed voice names (with
    positional fallbacks) and reported, but the whole text is spoken with
    the male voice.

    Args:
        text: Podcast script, possibly with speaker labels and Markdown.
        output_file: Path of the audio file to write.
    """
    clean_text = clean_script_for_tts(text)

    engine = pyttsx3.init()
    voices = engine.getProperty('voices')

    if not voices:
        print("‚ùå No voices found. Using default voice.")
        engine.save_to_file(clean_text, output_file)
        engine.runAndWait()
        return

    female_keywords = ('female', 'zira', 'susan', 'samantha')
    male_keywords = ('male', 'david', 'mark', 'alex')

    # Find male and female voices
    male_voice = None
    female_voice = None

    for voice in voices:
        voice_name = (voice.name or "").lower()
        # The female keywords are tested first: "male" is a substring of
        # "female", so testing male first would label female voices as male.
        if any(keyword in voice_name for keyword in female_keywords):
            female_voice = voice.id
        elif any(keyword in voice_name for keyword in male_keywords):
            male_voice = voice.id

    # Fallback to first two voices if gender-specific not found
    if not male_voice:
        male_voice = voices[0].id
    if not female_voice:
        female_voice = voices[1].id if len(voices) > 1 else voices[0].id

    print(f"üé≠ Using voices - Male: {male_voice}, Female: {female_voice}")

    engine.setProperty('voice', male_voice)
    engine.setProperty('rate', 160)  # Adjust speech rate
    engine.save_to_file(clean_text, output_file)
    engine.runAndWait()

    print(f"‚úÖ Podcast audio saved as {output_file}")

def create_dual_voice_podcast(script, output_file="podcast_episode.wav"):
    """Render `script` with pyttsx3, alternating voices per speaker.

    Lines starting with ``**Host:**`` / ``Host:`` or ``**Guest:**`` /
    ``Guest:`` open a new turn; following unlabelled lines are appended to
    that turn. Host turns use the first installed voice and Guest turns the
    second (or the first again if only one exists). Each turn is rendered to
    a temporary WAV file and the pieces are joined with 0.5 s of silence.

    Falls back to text_to_speech_with_voices when no voices are installed,
    when no speaker turns are found, or when rendering/combining fails.

    Args:
        script: Podcast script text.
        output_file: Path of the WAV file to write.
    """
    speaker_labels = (
        ('**Host:**', 'host'),
        ('**Guest:**', 'guest'),
        ('Host:', 'host'),
        ('Guest:', 'guest'),
    )

    # Parse the script to separate Host and Guest parts
    parts = []
    current_text = ""
    current_speaker = None

    for raw_line in script.split('\n'):
        line = raw_line.strip()
        for label, speaker in speaker_labels:
            if line.startswith(label):
                if current_text and current_speaker:
                    parts.append((current_speaker, current_text.strip()))
                current_speaker = speaker
                # Remove only the leading label; the same text occurring
                # later in the line is part of what is spoken.
                current_text = line[len(label):].strip()
                break
        else:
            if line:
                current_text += ' ' + line

    # Add the last part
    if current_text and current_speaker:
        parts.append((current_speaker, current_text.strip()))

    if not parts:
        # Nothing to alternate between (e.g. the script uses other speaker
        # names), so narrate the whole script with one voice instead of
        # exporting an empty file.
        print("No Host/Guest lines found in script. Using single voice.")
        text_to_speech_with_voices(script, output_file)
        return

    # Initialize TTS engine
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')

    if not voices:
        print("‚ùå No voices available. Using single voice.")
        text_to_speech_with_voices(script, output_file)
        return

    # Set up voices
    male_voice = voices[0].id
    female_voice = voices[1].id if len(voices) > 1 else voices[0].id

    print(f"üé≠ Male voice (Host): {voices[0].name}")
    print(f"üé≠ Female voice (Guest): {voices[1].name if len(voices) > 1 else 'Same as host'}")

    # Create temporary audio files for each part
    temp_files = []

    try:
        for i, (speaker, text) in enumerate(parts):
            if not text.strip():
                continue

            # Only the unique name is needed. Close the handle right away so
            # it is not leaked and the engine can write to the path.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
                temp_path = handle.name
            temp_files.append(temp_path)

            # Set voice based on speaker
            if speaker == 'host':
                engine.setProperty('voice', male_voice)
                engine.setProperty('rate', 160)  # Slightly faster for host
            else:  # guest
                engine.setProperty('voice', female_voice)
                engine.setProperty('rate', 150)  # Slightly slower for guest

            # Generate audio for this part
            engine.save_to_file(text, temp_path)
            engine.runAndWait()

            print(f"‚úÖ Generated {speaker} part {i+1}")

        # Combine all audio segments
        combined_audio = AudioSegment.empty()
        for temp_path in temp_files:
            if os.path.exists(temp_path):
                segment = AudioSegment.from_wav(temp_path)
                combined_audio += segment
                combined_audio += AudioSegment.silent(duration=500)  # Add 0.5s pause

        combined_audio.export(output_file, format="wav")
        print(f"‚úÖ Final podcast saved as {output_file}")

    except Exception as e:
        print(f"‚ùå Error creating dual voice podcast: {e}")
        print("üí° Falling back to single voice...")
        text_to_speech_with_voices(script, output_file)

    finally:
        # Clean up temporary files (best effort; a leftover temp file must
        # not hide the real outcome).
        for temp_path in temp_files:
            if os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

def create_gtts_dual_voice_podcast(script, output_file="podcast_gtts.mp3"):
    """Render `script` with gTTS, using a different accent per speaker.

    Lines starting with ``**Host:**`` / ``Host:`` or ``**Guest:**`` /
    ``Guest:`` open a new turn; following unlabelled lines are appended to
    that turn. Host turns are synthesised with ``tld='co.uk'`` and Guest
    turns with ``tld='com.au'``. Each turn is saved to a temporary MP3 and
    the pieces are joined with 0.8 s of silence.

    When no speaker turns are found, or when synthesis/combining fails, the
    cleaned script is synthesised as a single voice instead.

    Args:
        script: Podcast script text.
        output_file: Path of the MP3 file to write.
    """
    speaker_labels = (
        ('**Host:**', 'host'),
        ('**Guest:**', 'guest'),
        ('Host:', 'host'),
        ('Guest:', 'guest'),
    )

    # Parse script
    parts = []
    current_text = ""
    current_speaker = None

    for raw_line in script.split('\n'):
        line = raw_line.strip()
        for label, speaker in speaker_labels:
            if line.startswith(label):
                if current_text and current_speaker:
                    parts.append((current_speaker, current_text.strip()))
                current_speaker = speaker
                # Remove only the leading label; the same text occurring
                # later in the line is part of what is spoken.
                current_text = line[len(label):].strip()
                break
        else:
            if line:
                current_text += ' ' + line

    if current_text and current_speaker:
        parts.append((current_speaker, current_text.strip()))

    if not parts:
        # Nothing to alternate between (e.g. the script uses other speaker
        # names), so narrate the cleaned script with one voice instead of
        # exporting an empty file.
        print("No Host/Guest lines found in script. Using single voice.")
        tts = gTTS(text=clean_script_for_tts(script), lang="en")
        tts.save(output_file)
        print(f"‚úÖ Single voice fallback saved as {output_file}")
        return

    # Create audio segments
    temp_files = []

    try:
        for i, (speaker, text) in enumerate(parts):
            if not text.strip():
                continue

            # Only the unique name is needed. Close the handle right away so
            # it is not leaked and gTTS can write to the path.
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as handle:
                temp_path = handle.name
            temp_files.append(temp_path)

            # Use different TTS settings for different voices
            if speaker == 'host':
                # Host: British English endpoint
                tts = gTTS(text=text, lang='en', tld='co.uk', slow=False)
            else:  # guest
                # Guest: Australian English endpoint
                tts = gTTS(text=text, lang='en', tld='com.au', slow=False)

            tts.save(temp_path)
            print(f"‚úÖ Generated {speaker} part {i+1}")

        # Combine audio files
        combined_audio = AudioSegment.empty()
        for temp_path in temp_files:
            if os.path.exists(temp_path):
                segment = AudioSegment.from_mp3(temp_path)
                combined_audio += segment
                combined_audio += AudioSegment.silent(duration=800)  # Add pause

        combined_audio.export(output_file, format="mp3")
        print(f"‚úÖ Final podcast saved as {output_file}")

    except Exception as e:
        print(f"‚ùå Error combining audio: {e}")
        print("üí° Try installing: pip install pydub")
        # Fallback to single voice
        clean_text = clean_script_for_tts(script)
        tts = gTTS(text=clean_text, lang="en")
        tts.save(output_file)
        print(f"‚úÖ Single voice fallback saved as {output_file}")

    finally:
        # Clean up (best effort; a leftover temp file must not hide the real
        # outcome).
        for temp_path in temp_files:
            if os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

def create_background_music_podcast(script, output_file="podcast_with_music.mp3"):
    """Render `script` as a podcast with a quiet background tone.

    The voice track is produced by create_gtts_dual_voice_podcast into a
    temporary file, overlaid with a 440 Hz sine tone reduced by 30 dB, and
    exported to `output_file`. If any step fails, the podcast is generated
    again without the background tone. The temporary voice track is removed
    in every case.

    Args:
        script: Podcast script text.
        output_file: Path of the MP3 file to write.
    """
    # pydub builds tones with its generators module; AudioSegment itself has
    # no sine() constructor, so the earlier call always raised and the tone
    # was never added. Imported here to keep the fix inside this function.
    from pydub.generators import Sine

    voice_track = "temp_podcast.mp3"

    try:
        # Create the main audio first
        create_gtts_dual_voice_podcast(script, voice_track)

        # Load the podcast
        podcast = AudioSegment.from_mp3(voice_track)

        # Add a simple background tone (you could replace this with actual
        # music). len(podcast) is the track length in milliseconds.
        background = Sine(440).to_audio_segment(duration=len(podcast)).apply_gain(-30)

        # Mix the audio
        final_podcast = podcast.overlay(background)
        final_podcast.export(output_file, format="mp3")

        print(f"‚úÖ Podcast with background audio saved as {output_file}")

    except Exception as e:
        print(f"‚ùå Background music failed: {e}")
        print("üí° Creating podcast without background music...")
        create_gtts_dual_voice_podcast(script, output_file)

    finally:
        # Clean up on success and on failure alike.
        if os.path.exists(voice_track):
            os.unlink(voice_track)

# ===============================
# 6. Podcast Series Generation
# ===============================
def create_full_podcast_series(pdf_path, llm, max_chunks=5):
    """Turn a PDF into a numbered series of podcast episode files.

    The PDF text is split into chunks (size 1500, overlap 300) and the first
    `max_chunks` chunks each become one episode: an enhanced script from
    `llm`, wrapped with intro/outro, rendered to ``episode_NN.mp3``. A
    failure in one episode is printed and the loop moves on.

    Returns:
        List of the episode file names that were produced, or ``[]`` when no
        text could be extracted.
    """
    # Load and process PDF
    raw_text = get_pdf_text(pdf_path)
    if not raw_text:
        print("‚ùå Could not extract text from PDF")
        return []

    # Limit chunks for demo
    chunks = get_chunks(raw_text, chunk_size=1500, chunk_overlap=300)[:max_chunks]
    episode_total = len(chunks)

    print(f"üìö Creating {episode_total} podcast episodes...")

    all_episodes = []

    for episode_number, chunk in enumerate(chunks, start=1):
        print(f"\nüéôÔ∏è Generating Episode {episode_number}/{episode_total}...")

        try:
            # Script for this chunk, then the fixed intro/outro around it
            full_script = add_podcast_intro_outro(
                generate_enhanced_podcast(
                    chunk, llm,
                    chunk_number=episode_number,
                    total_chunks=episode_total,
                ),
                topic_title=f"Study Material - Part {episode_number}",
            )

            # Render the audio
            episode_file = f"episode_{episode_number:02d}.mp3"
            create_gtts_dual_voice_podcast(full_script, episode_file)

            all_episodes.append(episode_file)
            print(f"‚úÖ Episode {episode_number} saved as {episode_file}")

        except Exception as e:
            print(f"‚ùå Error creating episode {episode_number}: {e}")

    print(f"\nüéâ Created {len(all_episodes)} podcast episodes!")
    return all_episodes

# ===============================
# 7. Alternative: Generate Separate Audio Files
# ===============================
def create_multi_voice_podcast(script, output_dir="podcast_parts"):
    """Write one MP3 per speaker turn of `script` into `output_dir`.

    The script is split on ``**Host:**`` / ``**Guest:**`` markers. Each
    non-empty turn is cleaned and saved with gTTS as
    ``<output_dir>/<NN>_<speaker>.mp3``; the counter advances only when a
    file was saved. Text before the first marker is ignored.
    """
    os.makedirs(output_dir, exist_ok=True)

    label_pattern = r'\*\*(Host|Guest):\*\*'

    # The capturing group keeps the markers in the split result
    pieces = re.split(r'(\*\*(?:Host|Guest):\*\*)', script)

    speaker = None
    next_index = 1

    for piece in pieces:
        label = re.match(label_pattern, piece)
        if label:
            speaker = label.group(1).lower()
            continue
        if not (piece.strip() and speaker):
            continue

        spoken = clean_script_for_tts(piece)
        if not spoken:
            continue

        filename = f"{output_dir}/{next_index:02d}_{speaker}.mp3"
        try:
            gTTS(text=spoken, lang="en").save(filename)
            print(f"‚úÖ Created: {filename}")
            next_index += 1
        except Exception as e:
            print(f"‚ùå Error creating {filename}: {e}")

# ===============================
# 8. Utility Functions
# ===============================
def check_dependencies():
    """Report whether every required package can be imported.

    Each module is imported by name; for those that raise ImportError the
    matching ``pip install`` line is printed. Returns True when nothing is
    missing, otherwise False.
    """
    # import name -> name to use with pip
    module_to_pip = {
        'PyPDF2': 'PyPDF2',
        'langchain': 'langchain',
        'langchain_groq': 'langchain-groq',
        'gtts': 'gTTS',
        'pyttsx3': 'pyttsx3',
        'pydub': 'pydub'
    }

    def _importable(module_name):
        try:
            __import__(module_name)
        except ImportError:
            return False
        return True

    missing = [
        pip_name
        for module_name, pip_name in module_to_pip.items()
        if not _importable(module_name)
    ]

    if not missing:
        return True

    print("‚ùå Missing packages:")
    for pip_name in missing:
        print(f"   pip install {pip_name}")
    return False

def list_available_voices():
    """Print the name and id of every voice pyttsx3 reports.

    Prints a notice when there are no voices, and prints (rather than
    raises) any error that occurs while querying the engine.
    """
    try:
        voices = pyttsx3.init().getProperty('voices')

        if not voices:
            print("‚ùå No voices available")
            return

        print("üé≠ Available system voices:")
        for position, voice in enumerate(voices, start=1):
            print(f"   {position}. {voice.name} ({voice.id})")
    except Exception as e:
        print(f"‚ùå Error listing voices: {e}")

# ===============================
# 9. Main Function
# ===============================
def main():
    """Run the interactive PDF-to-podcast generator.

    Steps, in order:
      1. Check that the required packages import (``check_dependencies``).
      2. Read the Groq API key from the ``GROQ_API_KEY`` environment variable
         and build the ``ChatGroq`` client.
      3. Ask for a PDF path (a default path is used on empty input), extract
         its text and split it into chunks.
      4. Ask for a mode: ``1`` single episode from the first chunk (default),
         ``2`` a series of up to 10 episodes, ``3`` list system voices.

    Every failure is reported with a printed message followed by an early
    return; the function always returns ``None``.
    """
    print("üéôÔ∏è PDF to Podcast Generator")
    print("=" * 50)

    # Check dependencies
    if not check_dependencies():
        print("\nüí° Please install missing packages and try again.")
        return

    # Setup API key. It is read from the environment only: a hard-coded
    # fallback would leak the secret and make the check below unreachable.
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        print("‚ùå Please set your GROQ_API_KEY as an environment variable")
        return

    # Setup LLM
    try:
        llm = ChatGroq(
            temperature=0.7,
            groq_api_key=groq_api_key,
            model_name="llama-3.3-70b-versatile"
        )
    except Exception as e:
        print(f"‚ùå Error setting up LLM: {e}")
        return

    # Get PDF path from user
    pdf_path = input("üìÅ Enter PDF file path (or press Enter for default): ").strip()
    if not pdf_path:
        pdf_path = "/content/NOTES UNIT-4 ANN.pdf"  # Default path

    if not os.path.exists(pdf_path):
        print(f"‚ùå PDF file not found: {pdf_path}")
        return

    # Load PDF and generate chunks
    print(f"\nüìñ Loading PDF: {pdf_path}")
    raw_text = get_pdf_text(pdf_path)

    if not raw_text:
        print("‚ùå Could not extract text from PDF")
        return

    chunks = get_chunks(raw_text)
    print(f"‚úÖ Created {len(chunks)} text chunks")

    # Choose operation mode
    print("\nüéØ Choose your mode:")
    print("1. Single episode (first chunk only)")
    print("2. Full podcast series (multiple episodes)")
    print("3. List available voices")

    mode = input("Enter choice (1-3) or press Enter for default (1): ").strip() or "1"

    if mode == "3":
        list_available_voices()
        return

    if mode == "2":
        requested = input("How many episodes? (1-10, default 3): ").strip()
        try:
            max_episodes = int(requested) if requested else 3
        except ValueError:
            # Non-numeric input falls back to the advertised default.
            max_episodes = 3
        # Clamp to the advertised 1-10 range (0 and negatives become 1).
        max_episodes = max(1, min(max_episodes, 10))

        print(f"\nüé¨ Creating {max_episodes} episodes...")
        episodes = create_full_podcast_series(pdf_path, llm, max_episodes)
        if episodes:
            print(f"\nüéâ Successfully created {len(episodes)} episodes!")
            for ep in episodes:
                print(f"   üìÅ {ep}")
        return

    # Single episode mode (also reached for any unrecognised mode value)
    if not chunks:
        # Text that is non-empty but yields no chunks (e.g. whitespace only)
        # would otherwise raise IndexError on chunks[0] below.
        print("‚ùå Could not extract text from PDF")
        return

    print("\nüéôÔ∏è Generating single episode from first chunk...")

    # Generate podcast script
    podcast_script = generate_enhanced_podcast(chunks[0], llm)
    full_script = add_podcast_intro_outro(podcast_script, "Study Material")

    # Show at most the first 500 characters as a preview.
    print("\nüìù Generated Podcast Script:")
    print("=" * 50)
    print(full_script[:500] + "..." if len(full_script) > 500 else full_script)
    print("=" * 50)

    # Always use the gTTS dual-voice path for the audio
    print("\nüé≠ Creating podcast with dual voices:")
    print("   üßë Host (Sarah) - British English accent")
    print("   üë© Guest (Dr. Alex) - Australian English accent")

    try:
        create_gtts_dual_voice_podcast(full_script, "podcast_gtts_dual.mp3")
    except Exception as e:
        print(f"‚ùå Error generating audio: {e}")
        print("üí° Make sure all dependencies are installed properly")
    else:
        # Report success only when audio generation did not raise.
        print("\nüéâ Podcast generation complete!")

    print("\nüí° Tips for better results:")
    print("- Use PDFs with clear, well-formatted text")
    print("- Install pydub for audio combination features")
    print("- Try different voice methods for best quality")

# ===============================
# 10. Entry Point
# ===============================
# Start the interactive generator only when this code is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

SUMMARIZER

In [None]:
import os

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings

# ===============================
# Setup Groq LLM
# ===============================
# The API key is taken from the GROQ_API_KEY environment variable so that no
# secret is stored in the source. Fail early with a clear message when it is
# missing instead of sending requests with an invalid key.
_groq_api_key = os.getenv("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError("Please set your GROQ_API_KEY as an environment variable")

# Shared chat model used by summarize_pdf().
llm = ChatGroq(
    temperature=0,
    groq_api_key=_groq_api_key,
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"
)

# ===============================
# Helper Functions
# ===============================
def load_pdf_text(pdf_paths):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_paths: An iterable of PDF file paths, or a single path string.
            Each path is stripped of surrounding whitespace before opening.

    Returns:
        str: The page texts joined in order with no separator. An empty
        iterable (or a blank string) yields ``""``.
    """
    # A bare string would otherwise be iterated character by character and
    # each character opened as a "file"; treat it as one path instead.
    if isinstance(pdf_paths, str):
        pdf_paths = [pdf_paths] if pdf_paths.strip() else []

    page_texts = []
    for path in pdf_paths:
        reader = PdfReader(path.strip())
        # `or ""` keeps the join safe if extract_text() gives a falsy result.
        page_texts.extend(page.extract_text() or "" for page in reader.pages)
    return "".join(page_texts)

def chunk_text(text):
    """Break ``text`` into pieces for embedding.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size of
    1000 and an overlap of 100, and returns whatever its ``split_text``
    produces for ``text``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    ).split_text(text)

def build_vectorstore(chunks):
    """Embed ``chunks`` and index them in a FAISS vector store.

    The embeddings come from the ``sentence-transformers/all-MiniLM-L6-v2``
    HuggingFace model; the populated store is returned.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    store = FAISS.from_texts(chunks, embedding=embedder)
    return store

# ===============================
# Summarizer
# ===============================
def summarize_pdf(pdf_paths, detail=False):
    """Summarize one or more PDFs by querying a retrieval QA chain.

    The PDFs are read, chunked and indexed in a vector store; a "stuff"
    ``RetrievalQA`` chain over that store is then asked a fixed
    summarization question using the module-level ``llm``.

    Args:
        pdf_paths: Paths of the PDFs to summarize (passed to
            ``load_pdf_text``).
        detail: When true, ask for a detailed summary with main topics and
            bullet points; otherwise ask for a one-paragraph summary.

    Returns:
        The value returned by the chain's ``run`` call for the query.

    Raises:
        ValueError: If no text chunks could be produced from the PDFs.
    """
    # 1. Load PDF
    text = load_pdf_text(pdf_paths)
    chunks = chunk_text(text)
    if not chunks:
        # An empty chunk list cannot be indexed; fail here with a clear
        # message instead of an obscure error from the vector store.
        raise ValueError("No extractable text found in the given PDF(s)")
    vectorstore = build_vectorstore(chunks)

    # 2. Create QA chain
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        chain_type="stuff"
    )

    # 3. Ask summarization
    if detail:
        query = "Summarize this document in detail with main topics and bullet points."
    else:
        query = "Give a short summary of this document in one paragraph."

    return qa.run(query)

# ===============================
# Run
# ===============================
# Script entry point: prompt for PDF paths, then print a short and a
# detailed summary of them.
if __name__ == "__main__":
    pdf_input = input("Enter PDF file paths (comma separated): ")
    # Drop blank entries (empty input, trailing or doubled commas) so that an
    # empty string is never handed to the PDF reader as a path.
    pdf_paths = [p.strip() for p in pdf_input.split(",") if p.strip()]

    if not pdf_paths:
        print("No PDF paths provided.")
    else:
        print("üìñ Generating Summary...\n")

        short_summary = summarize_pdf(pdf_paths, detail=False)
        detailed_summary = summarize_pdf(pdf_paths, detail=True)

        print("‚úÖ Short Summary:\n", short_summary, "\n")
        print("‚úÖ Detailed Summary:\n", detailed_summary)
