<a href="https://colab.research.google.com/github/Flizyx/LLM-Langchain-Chunking-summarizer/blob/main/AI_Engineer_Test_Generative_Transcript_Transformation_(chunking_openai).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. Install Dependencies ---

#!pip install -q langchain transformers pypdf PyPDF2 gradio langchain-community
!pip install -q langchain openai tiktoken pypdf PyPDF2 gradio langchain-community

In [None]:
import os
# Export API credentials into the process environment.
# OPENAI_API_KEY is read back when the ChatOpenAI client is constructed;
# HUGGINGFACEHUB_API_TOKEN is not referenced by any code visible in this notebook.
# NOTE(review): both values are placeholders — supply real keys at runtime and
# avoid committing real secrets into the notebook.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "YOUR_API_KEY"
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"

In [None]:
# Hello

In [None]:
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage


#######################################
# 2) Initialize ChatOpenAI
#######################################
# Module-level chat-model client; `chatgpt_call` sends every request through it.
# The key lookup uses os.environ[...], so a missing OPENAI_API_KEY raises KeyError here.
# NOTE(review): max_tokens presumably caps the completion length, and prompt +
# completion must still fit the model's context window — confirm for long inputs.
llm = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=3000  # Increase for longer outputs
)

#######################################
# 3) Prompts
#######################################
# System prompt used by generate_summary() for each PDF chunk.
SUMMARY_PROMPT = """
You are a world-class summarizer. Summarize the following text, highlighting the key points while retaining important details.
Focus on capturing everything needed for a future teaching transcript. Avoid unnecessary fluff, but don't leave out key insights.
"""

# The five PROMPT_* constants below are the system prompts for the five
# transcript sections, consumed in this order by generate_structured_transcript().

# Section 1: title and introduction.
PROMPT_TITLE_INTRO = """
You are tasked with writing the first part of a structured teaching transcript:
1) A compelling Title
2) An engaging Introduction

Use the text below for context, highlighting relevant details.
Provide a strong hook and clarify the purpose of this lecture.
Aim for thoroughness (at least a few hundred words).
"""

# Section 2: introductory overview.
PROMPT_INTRO_OVERVIEW = """
Now generate the second part of this teaching transcript:
Introductory Overview of the main topic.

Use the text below for context.
Explain background, importance, and scope in a structured, engaging manner.
Aim for depth and clarity.
"""

# Section 3: detailed explanation of key topics.
PROMPT_DETAILED_EXPLANATION = """
Generate the third part of the teaching transcript:
A Detailed Explanation of Key Topics.

Use the text below to dive deeper, present subtopics,
and cover important points in a systematic way.
Include enough detail for a comprehensive understanding.
"""

# Section 4: practical examples and solutions.
PROMPT_PRACTICAL_EXAMPLES = """
Generate the fourth part of the teaching transcript:
Practical Examples and Solutions.

Use the text below to illustrate real-world examples,
use cases, and best practices.
Highlight practical strategies or solutions relevant to the topic.
"""

# Section 5: summary and next steps.
PROMPT_SUMMARY_NEXTSTEPS = """
Finally, generate the fifth part of the transcript:
Summary and Next Steps.

Use the text below to summarize key takeaways,
and provide guidance or next steps for students to continue learning.
"""

# System prompt used by enrich_transcript_chunked() for each transcript piece.
# NOTE(review): the 3900+ word target is sent with every piece, not once for
# the whole transcript — confirm that this is the intended behaviour.
ENRICH_PROMPT = """
We have a teaching transcript that isn't quite long enough. Expand it into a more detailed lecture:
Aim for 3900+ words. Add further examples, deeper explanations, relevant stories, or advanced topics.
"""

#######################################
# 4) State Variables for Gradio
#######################################
# Hand-off storage between the four pipeline steps: each handler writes its
# result to `.value` and the next handler reads it back from there.
# NOTE(review): these gr.State objects are created at module level, outside the
# `gr.Blocks()` context, and are never passed as event inputs/outputs. That
# presumably makes them process-wide globals shared by every browser session
# rather than per-session state — confirm against the Gradio State documentation.
parsed_chunks_state = gr.State([])        # Holds chunked documents
summary_state = gr.State("")             # Holds combined summary text
teaching_transcript_state = gr.State("") # Holds the final teaching transcript

#######################################
# 5) Helper Functions
#######################################
def chatgpt_call(system_prompt: str, user_prompt: str) -> str:
    """Send one system/user message pair to the shared chat model.

    Both prompts are stripped of surrounding whitespace before sending, and
    the reply text is stripped before it is returned.
    """
    system_message = SystemMessage(content=system_prompt.strip())
    human_message = HumanMessage(content=user_prompt.strip())
    reply = llm([system_message, human_message])
    return reply.content.strip()

def parse_and_chunk(pdf_file):
    """Load a PDF and split its pages into overlapping chunked documents.

    Args:
        pdf_file: Either a filesystem path given as a string, or an upload
            object that exposes the path through a ``name`` attribute.

    Returns:
        The list of chunk documents produced by the text splitter.
    """
    # Plain string paths are used as-is; any other object is expected to carry
    # its path in `.name` (the only form the original implementation accepted).
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    loader = PyPDFLoader(pdf_path)
    raw_docs = loader.load()

    # chunk_size / chunk_overlap are presumably measured in characters — confirm
    # against the splitter's documentation.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=500
    )
    chunked_docs = text_splitter.split_documents(raw_docs)

    print(f"[INFO] Loaded {len(raw_docs)} PDF pages.")
    print(f"[INFO] Created {len(chunked_docs)} chunks.")
    # Log the word count of the first few chunks as a quick sanity check.
    for i, doc in enumerate(chunked_docs[:3]):
        print(f"[DEBUG] Sample Chunk {i+1} length: {len(doc.page_content.split())} words")

    return chunked_docs

def generate_summary(chunks):
    """
    Build the "combined text" by summarizing every non-blank chunk on its own
    and joining the per-chunk summaries with newlines. There is no second
    summarization pass over the joined result.
    """
    # Lazily strip each chunk; blank chunks are filtered out below so they
    # never trigger a model call.
    stripped_texts = (doc.page_content.strip() for doc in chunks)
    partial_summaries = [
        chatgpt_call(SUMMARY_PROMPT, text) for text in stripped_texts if text
    ]

    combined_text = "\n".join(partial_summaries)
    print("[DEBUG] Combined summary length:", len(combined_text.split()), "words")
    return combined_text

########################################
# Splitting Text (Subchunks) for Map-Reduce
########################################
def split_text_into_subchunks(text: str, chunk_size=3000, overlap=500) -> list:
    """Split ``text`` into character windows that overlap their neighbours.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per sub-chunk (must be > 0).
        overlap: Number of characters shared between consecutive sub-chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of sub-chunks in order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap >= chunk_size would otherwise never advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    subchunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        subchunks.append(text[start:end])
        if end == text_length:
            # The window already reached the end of the text. Advancing again
            # would only emit a tail that is fully contained in this chunk.
            break
        start += step
    return subchunks

def generate_section_chunked(full_text: str, system_prompt: str) -> str:
    """
    Map-reduce generation of one transcript section: run `system_prompt` over
    each overlapping piece of `full_text`, then ask the model to merge the
    per-piece drafts into a single section.
    """
    pieces = split_text_into_subchunks(full_text, 3000, 500)

    # Map: one draft per piece.
    drafts = [chatgpt_call(system_prompt, piece) for piece in pieces]

    # Reduce: the merge instructions quote the section prompt so the model
    # knows which section the drafts belong to.
    merge_instructions = f"""
We have multiple partial outputs for this section:
'{system_prompt.strip()}'
Please unify them into one coherent section without omitting important details,make it long, dont summarize but enrichen.
"""
    return chatgpt_call(merge_instructions, "\n".join(drafts))

def generate_structured_transcript(full_combined_text: str) -> str:
    """
    Produce the five transcript sections in order, each generated from the
    same combined text via the chunked generator, and return them joined by
    blank lines. No final merge pass is run over the joined result.
    """
    # (progress message, section prompt) pairs, in output order.
    section_plan = (
        ("[INFO] Generating Title & Intro...", PROMPT_TITLE_INTRO),
        ("[INFO] Generating Introductory Overview...", PROMPT_INTRO_OVERVIEW),
        ("[INFO] Generating Detailed Explanation...", PROMPT_DETAILED_EXPLANATION),
        ("[INFO] Generating Practical Examples & Solutions...", PROMPT_PRACTICAL_EXAMPLES),
        ("[INFO] Generating Summary & Next Steps...", PROMPT_SUMMARY_NEXTSTEPS),
    )

    sections = []
    for progress_message, section_prompt in section_plan:
        print(progress_message)
        sections.append(generate_section_chunked(full_combined_text, section_prompt))

    # Sections are concatenated as-is so that no detail is lost to a merge step.
    all_sections = "\n\n".join(sections)
    print("[DEBUG] All sections length:", len(all_sections.split()), "words")

    return all_sections

########################################
# Chunked Enrichment
########################################
def enrich_transcript_chunked(transcript: str) -> str:
    """
    Expand a transcript piece by piece: split it into overlapping sub-chunks,
    send each one to the model with the enrichment prompt, and return the
    expanded pieces joined by newlines. Working per piece keeps every prompt
    small; the pieces are concatenated without a further model call.
    """
    pieces = split_text_into_subchunks(transcript, 3000, 500)

    # "Map" step: enrich each piece independently.
    expanded_pieces = [chatgpt_call(ENRICH_PROMPT, piece) for piece in pieces]

    # "Reduce" step: plain newline join of the enriched pieces.
    return "\n".join(expanded_pieces)

#######################################
# 6) Gradio Event Handlers
#######################################
def step1_parse(pdf):
    """Gradio handler for Step 1: parse the uploaded PDF and store its chunks.

    Args:
        pdf: The value of the file-upload component; ``None`` is treated as
            "nothing uploaded yet".

    Returns:
        A Markdown status message: either the chunk count or a prompt to
        upload a file first.
    """
    print("[INFO] Step 1: Parsing & Chunking started...")
    if pdf is None:
        # Without this guard the parser fails with AttributeError on `None.name`.
        # Report the problem the same way the later steps report missing input.
        print("[ERROR] No PDF uploaded.")
        return "No PDF uploaded. Please upload a PDF transcript first."

    chunks = parse_and_chunk(pdf)
    # Stored for Step 2, which reads the chunks back from this state object.
    parsed_chunks_state.value = chunks
    print("[INFO] Step 1: Parsing & Chunking finished.")
    return (
        f"**Step 1 Results**\n\n"
        f"- Successfully parsed PDF\n"
        f"- Found **{len(chunks)}** chunks in total."
    )

def step2_summary():
    """Gradio handler for Step 2: summarize the stored chunks.

    Reads the chunks saved by Step 1, stores the combined summary text for
    Step 3, and returns a Markdown report with the text and its word count.
    Returns a short notice instead when no chunks have been stored yet.
    """
    print("[INFO] Step 2: Summarization started...")
    stored_chunks = parsed_chunks_state.value

    if stored_chunks:
        summary_text = generate_summary(stored_chunks)
        summary_state.value = summary_text

        word_count = len(summary_text.split())
        print("[INFO] Step 2: Summarization finished. Word Count:", word_count)

        return (
            f"**Initial Combined Text**\n\n{summary_text}\n\n"
            f"**Word Count:** {word_count}\n\n"
            "Use Step 3 to build a multi-step teaching transcript from this text."
        )

    print("[ERROR] No chunks found.")
    return "No chunks found. Please parse a PDF first."

def step3_teaching():
    """Gradio handler for Step 3: build the five-section teaching transcript.

    Reads the combined text saved by Step 2, stores the generated transcript
    for Step 4, and returns a Markdown report that ends with a hint on whether
    the 3900-word target was reached.
    """
    source_text = summary_state.value
    if not source_text:
        print("[ERROR] No combined text found.")
        return "No combined text found. Please run Step 2 first."

    print("[INFO] Generating structured teaching transcript (5 sections, chunked)...")
    transcript = generate_structured_transcript(source_text)
    teaching_transcript_state.value = transcript

    word_count = len(transcript.split())
    if word_count < 3900:
        length_hint = "\nIt looks shorter than 3900 words. You can click 'Enrich Transcript' to expand it."
    else:
        length_hint = "\nGreat! This is around or above 3900 words."

    report = (
        f"**Teaching Transcript (~30 minutes)**\n\n{transcript}\n\n"
        f"**Word Count:** {word_count}\n"
    )
    return report + length_hint

def step4_enrich():
    """Gradio handler for Step 4: expand the stored teaching transcript.

    Reads the transcript saved by Step 3 (or by a previous enrichment), writes
    the enriched version back to the same state, and returns a Markdown report
    that ends with a hint on whether the 3900-word target was reached.
    """
    print("[INFO] Step 4: Enrichment started...")
    current_transcript = teaching_transcript_state.value
    if not current_transcript:
        print("[ERROR] No teaching transcript found.")
        return "No teaching transcript found. Please run Step 3 first."

    # The enriched text replaces the stored transcript, so a repeated click
    # enriches the already-enriched version.
    enriched = enrich_transcript_chunked(current_transcript)
    teaching_transcript_state.value = enriched

    word_count = len(enriched.split())
    print("[INFO] Step 4: Enrichment finished. Word Count:", word_count)

    length_hint = (
        "\nStill less than 3900 words. You can enrich again or tweak your prompt."
        if word_count < 3900
        else "\nNow it's at or above 3900 words!"
    )
    report = (
        f"**Enriched Teaching Transcript**\n\n{enriched}\n\n"
        f"**Word Count:** {word_count}\n"
    )
    return report + length_hint

#######################################
# 7) Gradio Interface
#######################################
# Build the UI: one upload widget followed by four button/output pairs, one per
# pipeline step. Only Step 1 receives a component value as input; Steps 2-4 are
# wired with inputs=None and pick up their data from the module-level state objects.
with gr.Blocks() as demo:
    gr.Markdown("# Transform Transcript into Teaching Material (Multi-Section, No Final Summarize)")
    gr.Markdown(
        "1) Parse & chunk PDF\n"
        "2) Generate an initial combined text (partial summaries joined)\n"
        "3) Generate a multi-step teaching transcript (~30 mins) in 5 sections\n"
        "4) Enrich the final transcript (chunked) if under 3900 words"
    )

    pdf_input = gr.File(label="Upload PDF Transcript")

    # Step 1: the uploaded file is passed to the handler; the status message is shown as Markdown.
    parse_button = gr.Button("Step 1: Parse & Chunk")
    parse_output = gr.Markdown()
    parse_button.click(fn=step1_parse, inputs=pdf_input, outputs=parse_output)

    # Step 2: summarize the chunks stored by Step 1.
    summary_button = gr.Button("Step 2: Generate Combined Text")
    summary_output = gr.Markdown()
    summary_button.click(fn=step2_summary, inputs=None, outputs=summary_output)

    # Step 3: build the five-section transcript from the combined text.
    teaching_button = gr.Button("Step 3: Build Structured Transcript (~30 min)")
    teaching_output = gr.Markdown()
    teaching_button.click(fn=step3_teaching, inputs=None, outputs=teaching_output)

    # Step 4: enrich the stored transcript; the handler runs on every click and
    # does not itself check the 3900-word threshold before enriching.
    enrich_button = gr.Button("Step 4: Enrich (Chunked) If Under 3900 Words")
    enrich_output = gr.Markdown()
    enrich_button.click(fn=step4_enrich, inputs=None, outputs=enrich_output)

# debug=True keeps the cell running in Colab so that errors and logs stay visible.
demo.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://daf4db8a59c60c40cf.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[INFO] Step 1: Parsing & Chunking started...
[INFO] Loaded 13 PDF pages.
[INFO] Created 25 chunks.
[DEBUG] Sample Chunk 1 length: 405 words
[DEBUG] Sample Chunk 2 length: 113 words
[DEBUG] Sample Chunk 3 length: 422 words
[INFO] Step 1: Parsing & Chunking finished.
[INFO] Step 2: Summarization started...
[DEBUG] Combined summary length: 2805 words
[INFO] Step 2: Summarization finished. Word Count: 2805
[INFO] Generating structured teaching transcript (5 sections, chunked)...
[INFO] Generating Title & Intro...
[INFO] Generating Introductory Overview...
[INFO] Generating Detailed Explanation...
[INFO] Generating Practical Examples & Solutions...
[INFO] Generating Summary & Next Steps...
[DEBUG] All sections length: 2114 words
[INFO] Step 4: Enrichment started...
[INFO] Step 4: Enrichment finished. Word Count: 8882
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://daf4db8a59c60c40cf.gradio.live


