# Install the libraries 

In [11]:
# Install the third-party packages imported by this notebook into the kernel's environment.
# -q keeps pip's output quiet; -U upgrades the package to the newest available release.
%pip install PyPDF2
%pip install python-dotenv
%pip install -q -U google-generativeai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Import the necessary libraries

In [12]:
import os
import PyPDF2
from dotenv import load_dotenv 
import google.generativeai as genai

In [13]:
# Load the variables defined in a local .env file into the process environment.
load_dotenv()

True

In [14]:
# Read the Gemini API key from the environment.
# NOTE: os.getenv returns None when API_KEY_GOOGLE_GEMINI is not set.
API_KEY_GOOGLE= os.getenv("API_KEY_GOOGLE_GEMINI")

# Read the source PDF and TXT documents, then convert the PDFs to .txt

In [15]:
# Folder that holds the source documents (.pdf and .txt) to be processed.
document_folder = './Docs/origin/'

# List the folder once, in sorted order, so every run sees the files in the
# same sequence. Only filesystem errors (missing folder, not a directory,
# permissions) are translated into ValueError; a bare ``except`` would also
# swallow KeyboardInterrupt and hide the real cause of the failure.
try:
    _folder_entries = sorted(os.listdir(document_folder))
except OSError as exc:
    raise ValueError(
        f"Could not list documents in {document_folder!r}: {exc}"
    ) from exc

# PDFs first, then TXTs, each as a path that includes the folder.
pdf_files = [os.path.join(document_folder, f) for f in _folder_entries if f.endswith('.pdf')]
txt_files = [os.path.join(document_folder, f) for f in _folder_entries if f.endswith('.txt')]
documents_files = pdf_files + txt_files

In [16]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF as one whitespace-normalized string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of every page that was read before any error occurred,
        with all runs of whitespace (line breaks included) collapsed into
        single spaces. An empty string is returned when nothing could be read.

    Errors are reported with ``print`` instead of being raised, so a single
    unreadable file does not interrupt a batch run.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # ``or ""`` keeps the join below safe if a page yields None
                # or an empty string instead of text.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")

    # Pages are joined with a newline so the last word of one page is not
    # fused with the first word of the next. split() with no argument breaks
    # on any whitespace, which removes line breaks and repeated spaces at once.
    # Normalizing outside the try block means partial results are cleaned too.
    return " ".join("\n".join(page_texts).split())

# Preview pass: print the first 50 characters of every discovered document.
for doc_path in documents_files:
    is_pdf = doc_path.endswith('.pdf')
    if not is_pdf and not doc_path.endswith('.txt'):
        continue

    if is_pdf:
        print(f"Processing PDF file: {doc_path}")
        preview = extract_text_from_pdf(doc_path)[:50]
        print(f"Extracted content (first 50 characters):\n{preview}")
    else:
        print(f"TXT file found: {doc_path}")
        # TXT sources are read directly as UTF-8 text
        with open(doc_path, 'r', encoding='utf-8') as handle:
            preview = handle.read()[:50]
        print(f"Extracted content (first 50 characters)::\n{preview}")

Processing PDF file: ./Docs/origin/1.pdf


Extracted content (first 50 characters):
Site Reliability Engineering at Google Taken from:


In [17]:
# Folder that receives one plain-text file per source document. It is called
# ``input_folder`` because the text-processing stage reads its files as input.
input_folder = "./Docs/extracted"
os.makedirs(input_folder, exist_ok=True)

for file_path in documents_files:
    if file_path.endswith('.pdf'):
        extracted_text = extract_text_from_pdf(file_path)
    else:
        # TXT sources are already plain text. Passing them to the PDF parser
        # fails and would leave an empty output file, so read them directly.
        with open(file_path, 'r', encoding='utf-8') as f:
            extracted_text = f.read()

    # splitext swaps only the real extension; str.replace would also rewrite
    # a ".pdf" that appears in the middle of the file name.
    stem, _ = os.path.splitext(os.path.basename(file_path))
    output_file = os.path.join(input_folder, stem + '.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    print(f"Text saved in: {output_file}")


Text saved in: ./Docs/extracted\1.txt


# Text processing

In [18]:
# Configure the Gemini SDK with the API key read from the environment.
genai.configure(api_key=API_KEY_GOOGLE)

# Model instance used by the text-generation functions defined below.
model = genai.GenerativeModel("gemini-1.5-flash")

output_folder = "./Docs/pre-processing"  # Folder where the structured texts are saved
os.makedirs(output_folder, exist_ok=True)

def process_txt_file(file_path):
    """Ask the model to restructure a raw text file into clean sections.

    The file content is embedded in a prompt requesting an Introduction,
    Development and Conclusion layout, and the prompt is sent to the
    module-level ``model``.

    Args:
        file_path: Path of the UTF-8 ``.txt`` file to process.

    Returns:
        str | None: The text of the model response, or ``None`` when the file
        is empty, cannot be read, or the generation fails. Failures are
        reported with ``print`` rather than raised.
    """
    try:
        # Read the content of the .txt file
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()

        # An empty source gives the model nothing to structure: skip the API
        # call instead of asking it to produce content out of thin air.
        if not raw_text.strip():
            print(f"Skipping {file_path}: the file is empty")
            return None

        # Prompt asking the model to structure the content
        prompt = (
            "Organize and improve the following text into structured sections: "
            "Introduction, Development, and Conclusion. Ensure it is clean, "
            "detailed, and suitable for educational purposes:\n\n"
            f"{raw_text}"
        )

        response = model.generate_content([prompt])
        return response.text
    except Exception as e:
        print(f"Error procesando {file_path}: {e}")
        return None

# Process every .txt file found in the input folder
for file_name in os.listdir(input_folder):
    if not file_name.endswith('.txt'):
        continue

    source_path = os.path.join(input_folder, file_name)
    target_path = os.path.join(output_folder, file_name)

    print(f"Processing: {file_name}")
    structured_text = process_txt_file(source_path)

    # A falsy result (None or empty text) means there is nothing to save
    if not structured_text:
        print(f"The file could not be processed: {file_name}")
        continue

    with open(target_path, 'w', encoding='utf-8') as out_file:
        out_file.write(structured_text)
    print(f"Generation saved in: {target_path}")


Processing: 1.txt
Generation saved in: ./Docs/pre-processing\1.txt


In [21]:
output_folder_1 = "./Docs/educational_transcript_1"  # Folder where the transcripts are saved
os.makedirs(output_folder_1, exist_ok=True)

# Class duration (options: '30' or '60' minutes)
lecture_duration = "60" # Change to "30" if a shorter class is desired

# Function to generate the educational transcript
def create_educational_transcription(file_path, duration=None):
    """Generate a lecture transcript from a pre-processed text file.

    The file content is embedded in a prompt asking for an introduction, a
    detailed explanation of key points, a conclusion and a final
    'Activities' section, and the prompt is sent to the module-level ``model``.

    NOTE: a later cell defines another function with this same name and a
    required ``duration`` argument; once that cell runs, it replaces this one.

    Args:
        file_path: Path of the UTF-8 pre-processed ``.txt`` file.
        duration: Lecture length in minutes to mention in the prompt. When
            omitted, the module-level ``lecture_duration`` is used.

    Returns:
        str | None: The text of the model response, or ``None`` when the file
        is empty, cannot be read, or the generation fails. Failures are
        reported with ``print`` rather than raised.
    """
    try:
        # Fall back to the notebook-wide setting when no duration is given
        minutes = lecture_duration if duration is None else duration

        # Read the contents of the preprocessed file
        with open(file_path, 'r', encoding='utf-8') as f:
            preprocessed_text = f.read()

        # Without source material the model would have to invent the lecture,
        # so skip the API call for empty files.
        if not preprocessed_text.strip():
            print(f"Skipping {file_path}: the file is empty")
            return None

        # Prompt to generate educational transcript
        prompt = (
            "Using the following text, create a detailed and structured transcript suitable for a "
            f"{minutes}-minute lecture. The output should include: \n"
            "1. An introduction to the topic.\n"
            "2. A detailed explanation of key points, with examples.\n"
            "3. A conclusion with reflections or actionable steps for students.\n\n"
            "Additionally, add a section at the end titled 'Activities' with tasks or questions "
            "to engage students and reinforce their understanding of the topic.\n\n"
            f"Input Text:\n{preprocessed_text}"
        )

        # Generate the content with the model
        response = model.generate_content([prompt])
        return response.text
    except Exception as e:
        print(f"Error procesando {file_path}: {e}")
        return None

# Process every pre-processed .txt file found in the pre-processing folder
for file_name in os.listdir(output_folder):
    if not file_name.endswith('.txt'):
        continue

    source_path = os.path.join(output_folder, file_name)
    target_path = os.path.join(output_folder_1, file_name)

    print(f"Generating educational transcript for: {file_name}")
    transcript = create_educational_transcription(source_path)

    # A falsy result (None or empty text) means there is nothing to save
    if not transcript:
        print(f"The transcript could not be generated for: {file_name}")
        continue

    with open(target_path, 'w', encoding='utf-8') as out_file:
        out_file.write(transcript)
    print(f"Transcript saved in: {target_path}")


Generating educational transcript for: 1.txt
Transcript saved in: ./Docs/educational_transcript_1\1.txt


In [22]:
# Setting the duration (choose between "30" or "60")
# The value is used as a key into the base_prompt dictionary defined below.
duration = "30"  # Change to "60" for a 60 minute class
output_folder_2 = "./Docs/educational_transcript_2"  # Folder where the transcripts are saved
os.makedirs(output_folder_2, exist_ok=True)


# Prompt template for the short lecture; "{content}" is filled in with str.format.
_PROMPT_30_MINUTES = "".join((
    "Using the following text, create a detailed and structured transcript suitable for a 30-minute lecture. ",
    "The output must include:\n",
    "1. An engaging introduction to the topic.\n",
    "2. Detailed explanations of key points, supported by examples.\n",
    "3. A conclusion with reflections and actionable next steps for students.\n",
    "4. Activities for students, such as questions, practical exercises, or case studies.\n\n",
    "Text:\n{content}",
))

# Prompt template for the long lecture; same placeholder, more demanding wording.
_PROMPT_60_MINUTES = "".join((
    "Using the following text, create a detailed and structured transcript suitable for a 60-minute lecture. ",
    "The output must include:\n",
    "1. An engaging and comprehensive introduction to the topic.\n",
    "2. Thorough explanations of key points, supported by detailed examples and case studies.\n",
    "3. A conclusion with reflections, future learning paths, and actionable next steps for students.\n",
    "4. Activities for students, including in-depth questions, group exercises, or real-world case studies.\n\n",
    "Text:\n{content}",
))

# Maps a lecture duration in minutes (as a string) to its prompt template.
base_prompt = {
    "30": _PROMPT_30_MINUTES,
    "60": _PROMPT_60_MINUTES,
}


def create_educational_transcription(file_path, duration):
    """Generate a lecture transcript using the template for ``duration``.

    The pre-processed text is inserted into the matching ``base_prompt``
    template and the result is sent to the module-level ``model``.

    Args:
        file_path: Path of the UTF-8 pre-processed ``.txt`` file.
        duration: Lecture length in minutes. It must match a key of
            ``base_prompt``; an integer such as ``30`` is accepted and
            converted to its string form.

    Returns:
        str | None: The text of the model response, or ``None`` when the
        duration is unsupported, the file is empty or unreadable, or the
        generation fails. Failures are reported with ``print`` rather than
        raised.
    """
    try:
        # Template keys are strings, so normalize ints like 30 to "30"
        duration_key = str(duration).strip()
        if duration_key not in base_prompt:
            supported = ", ".join(sorted(base_prompt))
            print(
                f"Error processing {file_path}: unsupported duration "
                f"{duration!r} (expected one of: {supported})"
            )
            return None

        with open(file_path, 'r', encoding='utf-8') as f:
            preprocessed_text = f.read()

        # Without source material the model would have to invent the lecture,
        # so skip the API call for empty files.
        if not preprocessed_text.strip():
            print(f"Skipping {file_path}: the file is empty")
            return None

        # str.format only parses the template; braces inside the inserted
        # text are copied as-is.
        prompt = base_prompt[duration_key].format(content=preprocessed_text)

        response = model.generate_content([prompt])
        return response.text
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


# Generate one transcript per pre-processed .txt file, using the selected duration
for file_name in os.listdir(output_folder):
    if not file_name.endswith('.txt'):
        continue

    source_path = os.path.join(output_folder, file_name)
    target_path = os.path.join(output_folder_2, file_name)

    print(f"Generating educational transcript for: {file_name} ({duration}-minute lecture)")
    transcript = create_educational_transcription(source_path, duration)

    # A falsy result (None or empty text) means there is nothing to save
    if not transcript:
        print(f"Failed to generate transcript for: {file_name}")
        continue

    with open(target_path, 'w', encoding='utf-8') as out_file:
        out_file.write(transcript)
    print(f"Transcript saved to: {target_path}")

Generating educational transcript for: 1.txt (30-minute lecture)
Transcript saved to: ./Docs/educational_transcript_2\1.txt
