In [1]:
import sys
import os

# Add the project directory (the parent of the notebook's directory) to the
# import path so that the `src` package can be imported.
# `__file__` is not defined inside a notebook, so the *string* "__file__" is
# resolved against the current working directory instead; its dirname is
# therefore the kernel's working directory.
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
project_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard the append so that re-running this cell does not pile up duplicates.
if project_dir not in sys.path:
    sys.path.append(project_dir)

#from src.ollama_utils import ollama_quick_chat
from src.agents import OpenAIVanillaAgent

import json
import time
import re

from dotenv import load_dotenv
load_dotenv()

from markdownify import markdownify
import pdfkit

from bs4 import BeautifulSoup

# Location of the wkhtmltopdf executable that pdfkit drives.
# The WKHTMLTOPDF_PATH environment variable (which may also be set in the .env
# file loaded above) overrides the default Windows install location, so the
# notebook does not have to be edited per machine.
path_to_wkhtmltopdf = os.environ.get(
    "WKHTMLTOPDF_PATH",
    r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe",
)
config = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)

In [2]:
# Load lecture transcripts: maps each transcript name (file stem) to the full
# text of "../data/<name>.txt".

lecture_transcripts = {}

for txt_file in ["TL1 - Full", "TL2 - Full", "TL3 - Full", "TL4 - Full"]:
    # Read with an explicit encoding instead of the platform locale default,
    # matching how the style sheet is read further down.
    # NOTE(review): assumes the transcripts are stored as UTF-8 — confirm.
    with open(f"../data/{txt_file}.txt", "r", encoding="utf-8") as file:
        lecture_transcripts[txt_file] = file.read()

In [3]:
# Extract lecture titles (Defined in the first line of the transcript ".txt" files)

# The pattern lazily captures everything up to the first newline, or up to the
# end of the text when the transcript is a single line.
lecture_titles = {
    name: re.match("(.*?)(\n|$)", text).group(1)
    for name, text in lecture_transcripts.items()
}

In [15]:
# Load prompts

# Explicit UTF-8 so the file is not decoded with the platform locale default.
with open("../data/prompts_short.json", "r", encoding="utf-8") as f:
    prompts = json.load(f)

# Preview: render the first task's user prompt with a placeholder transcript
# to check that the template formats as expected.
task = prompts["tasks"][0]

user_prompt = task["user_prompt"].format(transcript="This is the transcript content.")
print(user_prompt)

Consider the provided text transcript of the video-reccording of one of the lectures of the course 'Linguistics & AI' from my master's program in AI. Elaborate a detailed summary of the lecture's content. The final text's should have a textbook-like style, suitable to be used as study material. Also, provide it as a single unified text, avoiding the use of multiple headers or segmenting it. Do not include any sort of title or subtitle either, provide just the text summary. TRANSCRIPT: This is the transcript content.


In [5]:
# Conduct the tasks (OAI)

def _strip_code_fence(completion):
    """Return `completion` without a surrounding Markdown code fence.

    When the whole text is wrapped as a fenced block (for example
    ```html ... ```, with any or no language tag), only the fenced content is
    returned. Otherwise the text is returned as is, apart from stripping
    leading/trailing whitespace. This replaces a fixed `[9:-3]` slice, which
    cut into the HTML whenever the wrapper was absent or shaped differently.
    """
    fenced = re.match(
        r"\s*```[\w-]*[ \t]*\r?\n(.*?)\r?\n?```\s*$", completion, flags=re.DOTALL
    )
    return fenced.group(1) if fenced else completion.strip()


# output[<transcript name>][<task index>] holds the completion for that task;
# for task 3 the value is a dict keyed by the source task index as a string.
output = {}

for txt_file, transcript in lecture_transcripts.items():

    # Only this lecture is processed; every other transcript is skipped.
    if txt_file != "TL3 - Full":
        continue

    print("Working on: ", txt_file, "...")

    tmp = {}

    for task, task_spec in enumerate(prompts["tasks"]):

        print("        ...conducting task: ", task)

        if task not in [3, 4]:

            # Content tasks: the prompt template is filled with the transcript.
            system_prompt = task_spec["system_prompt"]
            user_prompt = task_spec["user_prompt"].format(transcript=transcript)

            tmp[task] = OpenAIVanillaAgent(system_prompt, user_prompt).completion()

        elif task == 3:

            # Formatting task: run once per result of tasks 0-2, passing that
            # result plus a title built from the lecture title and task name.
            tmp_html = {}

            for prev_task in [0, 1, 2]:

                system_prompt = task_spec["system_prompt"]
                user_prompt = task_spec["user_prompt"].format(
                    text=tmp[prev_task],
                    title=f"{lecture_titles[txt_file]} - {prompts['tasks'][prev_task]['name']}",
                )

                tmp_html[f"{prev_task}"] = _strip_code_fence(
                    OpenAIVanillaAgent(system_prompt, user_prompt).completion()
                )

            tmp[task] = tmp_html

        # Task 4 is not run in this loop; its prompt is applied in the
        # PDF-generation cell at the end of the notebook.

    output[txt_file] = tmp
    #break

Working on:  TL3 - Full ...
        ...conducting task:  0
        ...conducting task:  1
        ...conducting task:  2
        ...conducting task:  3


In [6]:
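## Conduct the tasks (Ollama) -- DISABLED, kept for reference only.
## NOTE(review): this variant addresses task indices 0-6 and fills the
## placeholders `concepts`, `questions` and `answers`, which differs from the
## task layout used by the OAI cell above -- confirm which prompts file it
## expects before re-enabling.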
#
#output = {}
#
#for transcript_name, transcript in lecture_transcripts.items():
#
#    print("Working on: ", transcript_name, "...")
#
#    tmp = {}
#    
#    for task in range(len(prompts["tasks"])):
#
#        print("        ...conducting task: ", task)
#
#        if task in [0, 1, 2, 4]:
#
#            system_prompt = prompts["tasks"][task]["system_prompt"]
#            user_prompt = prompts["tasks"][task]["user_prompt"].format(transcript=transcript)
#
#            tmp[task] = ollama_quick_chat(system_message=system_prompt, user_input=user_prompt, llm="llama3.3:70b")["message"]["content"]
#
#        elif task == 3:
#
#            system_prompt = prompts["tasks"][task]["system_prompt"]
#            user_prompt = prompts["tasks"][task]["user_prompt"].format(transcript=transcript, concepts=tmp[2])
#
#            tmp[task] = ollama_quick_chat(system_message=system_prompt, user_input=user_prompt, llm="llama3.3:70b")["message"]["content"]
#
#        elif task == 5:
#
#            system_prompt = prompts["tasks"][task]["system_prompt"]
#            user_prompt = prompts["tasks"][task]["user_prompt"].format(transcript=transcript, questions=tmp[4])
#
#            tmp[task] = ollama_quick_chat(system_message=system_prompt, user_input=user_prompt, llm="llama3.3:70b")["message"]["content"]
#
#        elif task == 6:
#
#            system_prompt = prompts["tasks"][task]["system_prompt"]
#            user_prompt = prompts["tasks"][task]["user_prompt"].format(transcript=transcript, questions=tmp[4], answers=tmp[5])
#
#            tmp[task] = ollama_quick_chat(system_message=system_prompt, user_input=user_prompt, llm="llama3.3:70b")["message"]["content"]
#
#        time.sleep(30)
#    time.sleep(120)
#
#    output[transcript_name] = tmp

In [7]:
# Persist the collected task results as pretty-printed JSON.
with open("../data/output_OAI.json", "w") as results_file:
    json.dump(output, results_file, indent=4)

In [8]:
#output['TL2 - Full']

In [None]:
# Build one PDF of study material per lecture: merge the HTML documents
# produced by task 3 into a single page, let the model homogenise it (task 4),
# and render the result with wkhtmltopdf.

with open("../data/print_friendly_style.txt", "r", encoding="utf-8") as style_file:
    print_friendly_style = style_file.read()

# Matches a completion that is entirely wrapped in a Markdown code fence
# (for example ```html ... ```), capturing only the fenced content.
fenced_completion = re.compile(r"\s*```[\w-]*[ \t]*\r?\n(.*?)\r?\n?```\s*$", re.DOTALL)

# Iterate over the lectures; output[lecture][3] maps each source task to its HTML.
for lecture, lecture_output in output.items():
    body_parts = []
    for html_output in lecture_output[3].values():
        # Parse the current HTML string with BeautifulSoup
        soup = BeautifulSoup(html_output, 'html.parser')

        # Collect the body content. html.parser does not synthesise a <body>,
        # so a bare fragment has soup.body == None; use the fragment itself in
        # that case instead of silently dropping it.
        container = soup.body if soup.body is not None else soup
        body_parts.append(container.decode_contents())

    combined_body = "".join(body_parts)

    # Create the combined HTML document
    combined_html = f"""<!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Combined Document</title>
        <style>
            {print_friendly_style}
        </style>
    </head>
    <body>
        {combined_body}
    </body>
    </html>"""

    system_prompt = prompts["tasks"][4]["system_prompt"]
    user_prompt = prompts["tasks"][4]["user_prompt"].format(combined_html=combined_html)

    homogen_html = OpenAIVanillaAgent(system_prompt, user_prompt).completion()

    # Defensive: if the model wrapped its answer in a code fence, unwrap it so
    # the fence markers are not rendered as literal text in the PDF.
    fenced = fenced_completion.match(homogen_html)
    if fenced:
        homogen_html = fenced.group(1)

    pdfkit.from_string(homogen_html, f"../data/{lecture}_material.pdf", configuration=config)