In [2]:
import gradio as gr
import faiss
import numpy as np
import ollama
import re
from pptx import Presentation
from gtts import gTTS
import tempfile
from sentence_transformers import SentenceTransformer

# Load Sentence Transformer for embedding text.
# The same model instance encodes both the slide texts (when a deck is
# indexed) and the user's question (at query time), so the vectors are
# comparable.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Global variables shared by the Gradio callbacks below.
# slide_texts holds one cleaned text string per slide, in slide order;
# it is rebuilt every time a presentation is processed.
slide_texts = []
# FAISS index built over the embeddings of slide_texts; None until a
# presentation has been processed.
index = None  # FAISS Index

# Function to clean extracted text
def clean_text(text):
    """Normalize text extracted from a slide shape.

    Keeps word characters (letters, digits, underscore), whitespace and the
    punctuation ``. , ! ? -``; every other character is dropped. Runs of
    whitespace (including newlines and tabs) are collapsed to a single space
    and leading/trailing whitespace is removed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    # Drop disallowed characters FIRST: removing a symbol that sits between
    # two spaces ("a & b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    # Then collapse whitespace runs so the result is single-spaced.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Function to extract, clean, and store slide text in FAISS
def extract_text_from_ppt(ppt_file):
    """Read a PPTX file, index its slide text in FAISS and present slide 1.

    Rebuilds the module-level ``slide_texts`` list (one cleaned string per
    slide, in order) and the module-level FAISS ``index`` over the slide
    embeddings, then generates the narration and audio for the first slide.

    Args:
        ppt_file: The uploaded file as delivered by the Gradio File
            component: either an object exposing the path as ``.name`` or a
            plain path string. ``None`` when nothing was uploaded.

    Returns:
        A 5-tuple ``(slide_text, speech_text, audio_path, slide_number,
        total_slides)`` matching the five outputs wired to the "Process PPT"
        button. On failure the first element is an error message and the
        remaining elements are ``"", None, 0, 0``.
    """
    global slide_texts, index

    if ppt_file is None:
        return "❌ No file uploaded. Please upload a PPTX file.", "", None, 0, 0

    # Accept both a tempfile-like wrapper (path in .name) and a bare path.
    ppt_path = getattr(ppt_file, "name", ppt_file)
    prs = Presentation(ppt_path)

    extracted = []
    for slide in prs.slides:
        shape_texts = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                cleaned_text = clean_text(shape.text)
                # Skip shapes that are empty after cleaning so they do not
                # contribute blank lines to the slide text.
                if cleaned_text:
                    shape_texts.append(cleaned_text)
        # Keep an entry for every slide (even an empty one) so list positions
        # stay aligned with the real slide numbers.
        extracted.append("\n".join(shape_texts))

    if not any(extracted):
        # Nothing to index: clear any state left over from a previous deck so
        # navigation and Q&A do not keep serving stale content.
        slide_texts = []
        index = None
        # Must return five values, one per wired Gradio output.
        return "❌ No useful text found in the slides.", "", None, 0, 0

    slide_texts = extracted

    # Convert text to embeddings
    embeddings = embedding_model.encode(slide_texts)
    d = embeddings.shape[1]  # Get embedding dimensions

    # Create FAISS index
    index = faiss.IndexFlatL2(d)
    index.add(np.array(embeddings, dtype=np.float32))

    # Generate elaborated content & voice for the first slide (Introduction)
    first_slide_text = slide_texts[0]
    elaborated_text = generate_presentation_speech(first_slide_text, slide_number=1)
    speech_audio = text_to_speech(elaborated_text)

    return first_slide_text, elaborated_text, speech_audio, 1, len(slide_texts)

# Function to generate elaborated slide speech (Customized for First & Last Slide)
def generate_presentation_speech(text, slide_number):
    """Ask the llama3 model to turn slide text into spoken narration.

    The system prompt depends on the slide's position: slide 1 gets an
    introduction prompt, the last slide (per the module-level
    ``slide_texts``) gets a conclusion prompt, and every other slide gets a
    neutral "explain the content" prompt. The first-slide check takes
    priority when the deck has a single slide.

    Args:
        text: The slide's text content.
        slide_number: 1-based position of the slide in the deck.

    Returns:
        The content of the model's reply message.
    """
    intro_prompt = "You are an AI presenter introducing the topic professionally. Provide a strong introduction, explaining the importance of the subject. Avoid listing points directly; make it engaging. Do not start with phrases like 'Here's a structured speech.' Just start speaking naturally."
    closing_prompt = "You are an AI presenter giving a structured, impactful conclusion. Summarize the key points covered in the presentation and leave the audience with a strong final message. Do not start with unnecessary phrases. Just begin naturally."
    body_prompt = "You are an AI presenter explaining the slide's content in a professional manner. Do not greet the audience repeatedly. Ensure smooth transitions between slides. Do not start with phrases like 'Here’s a structured speech.' Just present the content directly."

    last_slide = len(slide_texts)

    # Pick the system prompt by position; the introduction wins over the
    # conclusion for a one-slide deck.
    if slide_number == 1:
        system_prompt = intro_prompt
    else:
        system_prompt = closing_prompt if slide_number == last_slide else body_prompt

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Convert this slide content into a structured speech:\n{text}"},
    ]
    reply = ollama.chat(model="llama3", messages=conversation)

    return reply['message']['content']

# Function to generate speech from text
def text_to_speech(text):
    """Synthesize ``text`` to an MP3 file with gTTS.

    Args:
        text: The text to speak. ``None``, empty or whitespace-only text
            produces no audio.

    Returns:
        The path of the generated ``.mp3`` file, or ``None`` when there is
        nothing to speak. The file is created with ``delete=False`` and is
        not removed by this function.
    """
    if not text or not text.strip():
        return None

    # Reserve a unique file name, then close the handle immediately: the
    # original kept it open forever (descriptor leak), and an open handle can
    # prevent gTTS from reopening the path for writing on some platforms.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        audio_path = temp_audio.name

    tts = gTTS(text, lang="en")
    tts.save(audio_path)

    return audio_path  # Return file path

# Function to process slides one by one
def process_slide(slide_number):
    """Generate narration and audio for one slide of the loaded deck.

    Args:
        slide_number: Requested 1-based slide number. It is coerced to an
            integer and clamped to the valid range ``1..len(slide_texts)``.

    Returns:
        A 4-tuple ``(slide_text, speech_text, audio_path, slide_number)``
        where ``slide_number`` is the clamped value, so the UI shows the
        slide that was actually processed. When no deck is loaded, an error
        message is returned instead with ``"", None, 0``.
    """
    total_slides = len(slide_texts)

    # Without a loaded deck there is nothing to index into; clamping alone
    # would still end in an IndexError.
    if total_slides == 0:
        return "❌ No slides loaded. Please upload a PPT first.", "", None, 0

    # The value comes from a numeric UI field and may arrive as a float
    # (e.g. 2.0), which is not a valid list index.
    slide_number = int(slide_number)

    # Prevent invalid slide numbers
    slide_number = max(1, min(slide_number, total_slides))

    slide_text = slide_texts[slide_number - 1]
    elaborated_text = generate_presentation_speech(slide_text, slide_number)
    speech_audio = text_to_speech(elaborated_text)  # Generate voice

    return slide_text, elaborated_text, speech_audio, slide_number  # Ensure slide number updates in UI

# Function to retrieve relevant slide content for Q&A
def retrieve_context(query, top_k=3):
    """Return the text of the slides most similar to ``query``.

    Args:
        query: The user's question.
        top_k: Maximum number of slides to retrieve (default 3, the value
            that was previously hard-coded).

    Returns:
        The retrieved slide texts joined with newlines, or an error message
        when no presentation has been indexed yet.
    """
    if index is None or not slide_texts:
        return "❌ No slides indexed. Please upload a PPT first."

    query_embedding = embedding_model.encode([query])

    # Never ask for more neighbours than there are slides: FAISS pads missing
    # results with index -1, and slide_texts[-1] would silently return the
    # last slide instead of failing.
    k = max(1, min(top_k, len(slide_texts)))
    _, idx = index.search(np.array(query_embedding, dtype=np.float32), k=k)

    # Retrieve the most relevant slides, skipping any padding / out-of-range ids.
    retrieved_texts = [slide_texts[i] for i in idx[0] if 0 <= i < len(slide_texts)]
    return "\n".join(retrieved_texts)

# Function to generate answers using Llama 3
def answer_question(query):
    """Answer a question about the indexed slides, as text and as audio.

    Args:
        query: The user's question.

    Returns:
        A 2-tuple ``(answer_text, answer_audio_path)``. When the question is
        empty or no presentation has been indexed, ``answer_text`` is an
        error message and the audio path is ``None``.
    """
    # Do not spend an LLM call (and a TTS request) on an empty question.
    if not query or not query.strip():
        return "❌ Please enter a question.", None

    # Without indexed slides retrieve_context returns an error string; passing
    # that to the model as "slide content" would produce a made-up answer, so
    # report the problem to the user directly instead.
    if index is None or not slide_texts:
        return "❌ No slides indexed. Please upload a PPT first.", None

    context = retrieve_context(query)

    response = ollama.chat(model="llama3", messages=[
        {"role": "system", "content": "You are an AI assistant specialized in answering questions from presentation slides."},
        {"role": "user", "content": f"Based on this slide content:\n\n{context}\n\nAnswer this question: {query}"}
    ])

    answer_text = response['message']['content']
    answer_audio = text_to_speech(answer_text)  # Generate voice

    return answer_text, answer_audio

# Gradio Interface
# Components are created in display order; the statement order below is the
# page layout, so it must not be rearranged.
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 AI-Powered Slide Presenter with FAISS RAG Q&A + Voice")

    # Upload PPT and process slides
    with gr.Row():
        ppt_upload = gr.File(label="Upload PPTX File")
        ppt_button = gr.Button("Process PPT")

    slide_text = gr.Textbox(label="Slide Content", interactive=False)
    elaborated_text = gr.Textbox(label="Presentation Speech", interactive=False)
    speech_output = gr.Audio(label="Speech Audio")
    # precision=0 makes the component hand an int to the callbacks; without it
    # the value may arrive as a float, which cannot be used as a list index
    # when the slide is looked up.
    slide_number = gr.Number(value=1, label="Current Slide", interactive=False, precision=0)

    # Slide navigation buttons
    with gr.Row():
        prev_button = gr.Button("Previous Slide")
        next_button = gr.Button("Next Slide")

    # Slide counts are whole numbers, so display them without a fraction.
    total_slides = gr.Number(value=0, label="Total Slides", interactive=False, precision=0)

    # RAG Q&A interface
    gr.Markdown("### 🤖 Ask Questions About the Slides (FAISS RAG Q&A)")
    user_query = gr.Textbox(label="Your Question")
    answer_text = gr.Textbox(label="AI Answer", interactive=False)
    answer_audio = gr.Audio(label="Answer Audio")
    query_button = gr.Button("Get Answer")

    # Slide processing & navigation.
    # Processing a deck fills all five slide-related outputs.
    ppt_button.click(extract_text_from_ppt, inputs=ppt_upload,
                     outputs=[slide_text, elaborated_text, speech_output, slide_number, total_slides])

    # Navigation re-reads the current slide number from the UI and moves one
    # step; process_slide clamps the result and returns it to update the UI.
    prev_button.click(lambda n: process_slide(n - 1), inputs=slide_number,
                      outputs=[slide_text, elaborated_text, speech_output, slide_number])

    next_button.click(lambda n: process_slide(n + 1), inputs=slide_number,
                      outputs=[slide_text, elaborated_text, speech_output, slide_number])

    # Q&A: answer text plus its spoken version.
    query_button.click(answer_question, inputs=user_query, outputs=[answer_text, answer_audio])

demo.launch()


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


