In [None]:
from langchain_community.document_loaders import YoutubeLoader

# Video whose transcript is used throughout this notebook.
YT_URL = "https://www.youtube.com/watch?v=BZbGwnZ6UME"

# NOTE(review): presumably `language` lists acceptable transcript languages and
# `translation` the language to translate into -- confirm against YoutubeLoader.
loader = YoutubeLoader.from_youtube_url(
    YT_URL,
    language=["en", "es", "de"],
    translation="de",
    add_video_info=True,
)

# Later cells read the first element of the loaded result (`transcript[0]`).
transcript = loader.load()

In [None]:
# Show the metadata attached to the first loaded transcript document.
transcript[0].metadata

In [None]:
# Number of whitespace-separated tokens in the transcript text.
len(transcript[0].page_content.split())

In [None]:
from langchain.evaluation.qa import QAEvalChain

from langchain_core.prompts import PromptTemplate

# Grading prompt. Placeholders: question, correct_answer, incorrect_answers,
# difficulty, context, summary and language. The model is asked to answer with
# a bare grade after the trailing ">>GRADE<<:" marker.
# Fix: criterion 5 now uses the same "**...**" bold markup as the other criteria.
template = """You are a teacher grading a quiz.
You are given a quiz question, the true answer, three incorrect answers, the context from which the question was taken, and a short summary of the topic for which the quiz is intended. Your task is to grade the quiz question based on the following criteria:

1. **Clear and concise**: The question and answers should be easy to understand, well-formulated, and free from ambiguity.
2. **Engaging and challenging**: Incorrect answers should be plausible, making the quiz engaging and requiring thoughtful consideration.
3. **Relatively short**: Answers should be brief, typically no more than a sentence.
4. **Accurately aligned with the specified difficulty level**: The question should match the intended difficulty level. For **easy** quizzes, questions should focus on basic, easily recognizable information. For **medium** quizzes, questions should involve more complex topics requiring a deeper understanding. For **hard** quizzes, questions should require critical thinking, analysis, or interpretation of nuanced information, with particularly convincing incorrect options.
5. **Relevant to the topic**: The quiz question should be relevant to the provided topic, ensuring that it refers to the overall theme of the video. If the question does not refer to the topic as described in the provided video summary, the grade should be 0.00.
6. **Quiz question in correct language**: It is highly important that the quiz question is in the correct language, in this case the quiz language is **{language}** ('EN'=English, 'ES'=Spanish, 'DE'=German). If the question is not in the specified language, give it a grade of 0.00.

GRADE: (range from 0.00 - 4.00) - A grade of '0' means the question does not fulfill any criteria and may not reflect the provided context well. A grade of '4' means the question fully meets all criteria. The more criteria met, the higher the grade should be, up to a maximum of 4. The grade should be a floating point number with two decimal places.

Ignore issues in punctuation. Only provide the grade, no reasoning is required.

\n**QUESTION**: {question}
\n**CORRECT ANSWER**: {correct_answer}
\n**INCORRECT ANSWERS**: {incorrect_answers}
\n**DIFFICULTY**: {difficulty}
\n**CONTEXT**: {context}
\n**TOPIC SUMMARY**: {summary}
\n>>GRADE<<:"""


# Prompt object for the grader; lists every placeholder that `template` uses.
PROMPT = PromptTemplate(
    template=template,
    input_variables=[
        "question",
        "correct_answer",
        "incorrect_answers",
        "difficulty",
        "context",
        "summary",
        "language",
    ],
)

from langchain_openai import ChatOpenAI
from langchain.chains.llm import LLMChain

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

def get_qa_data(quiz_question, difficulty="HARD", language="ES", summary=None):
    """Build the keyword arguments for the grading prompt from one quiz question.

    Args:
        quiz_question: Object exposing ``page_content`` (the question text) and
            ``metadata`` with an ``"answer"`` dict (keys ``"correct_answer"`` and
            ``"wrong_answers"``) and a ``"context"`` entry.
        difficulty: Difficulty label handed to the grader. Defaults to ``"HARD"``,
            the value that used to be hard-coded.
        language: Language code handed to the grader. Defaults to ``"ES"``,
            the value that used to be hard-coded.
        summary: Topic summary. When ``None`` the notebook-level
            ``dark_matter_summary`` is used, as before.

    Returns:
        dict with the keys the grading prompt expects: ``question``,
        ``correct_answer``, ``incorrect_answers``, ``difficulty``, ``context``,
        ``summary`` and ``language``.
    """
    if summary is None:
        # Preserve the original behavior: read the summary from the notebook
        # namespace when the caller does not provide one.
        summary = dark_matter_summary

    answers = quiz_question.metadata["answer"]
    return {
        "question": quiz_question.page_content,
        "correct_answer": answers["correct_answer"],
        # The wrong answers are flattened into one string for the prompt.
        "incorrect_answers": " ### ".join(answers["wrong_answers"]),
        "difficulty": difficulty,
        "context": quiz_question.metadata["context"],
        "summary": summary,
        "language": language,
    }


# Grader chain: gpt-4o-mini at temperature 0 combined with the grading PROMPT.
llm_chain = LLMChain(
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
    prompt=PROMPT,
)

In [None]:
# NOTE: this cell consumes `vals` and `dark_matter_list`, which are produced by
# the grading cell further down the notebook -- run that cell first.

# Parse every grade exactly once and drop questions graded 0.00.
vals_converted = [
    (i, score)
    for i, score in ((i, float(grade)) for i, grade in vals)
    if score > 0.00
]

# Best-graded questions first.
sorted_vals = sorted(vals_converted, key=lambda item: item[1], reverse=True)

# The indices stored in `vals` are positions in `dark_matter_list` (the grading
# cell iterates over it). Indexing that list instead of `qa_list` itself keeps
# the cell idempotent: re-running it no longer selects from the already
# truncated `qa_list`.
qa_list = [dark_matter_list[i] for i, _ in sorted_vals[:10]]

In [None]:
# Display the current contents of `qa_list`.
qa_list

In [None]:
# Grade every question in `dark_matter_list`, collecting (index, raw grade)
# pairs. The grade is kept exactly as the chain returns it.
qa_list = dark_matter_list
vals = []

for i, quiz_question in enumerate(qa_list):
    val = llm_chain.run(**get_qa_data(quiz_question))
    vals.append((i, val))

In [None]:
from langchain.docstore.document import Document

# Copy the first document's metadata and store the transcript text next to it.
video_metadata = dict(**transcript[0].metadata)
video_metadata["transcript"] = transcript[0].page_content

# Fresh document that carries only the transcript text (no metadata).
doc = Document(page_content=video_metadata["transcript"])

In [None]:
# flake8: noqa
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System prompt for quiz generation. It contains no placeholders; the doubled
# braces render as literal JSON braces once the template is formatted.
# Fix: the mis-encoded byte sequence in the first sentence is restored to an
# em dash so the model does not receive garbage characters.
templ1 = """You are a highly intelligent and insightful assistant designed to generate high-quality multiple-choice quiz questions. Your task is to read the provided text, identify key information, and create a corresponding question with four possible answers—one correct answer and three plausible but incorrect alternatives.

Each question and its associated answers should be:
1. **Clear and concise**: Ensure that both the question and the answers are easy to understand, well-formulated, and free from ambiguity.
2. **Engaging and challenging**: The incorrect answers should be plausible enough to make the quiz engaging, requiring thoughtful consideration by the user.
3. **Relatively short**: Answers should be brief, typically no more than a sentence.
4. **Appropriately aligned with the specified difficulty level**: Ensure the generated questions match the intended difficulty. For **easy** quizzes, the questions should focus on basic, easily recognizable information with straightforward answers. For **medium** quizzes, questions should involve more complex topics requiring a deeper understanding or application of the subject. For **hard** quizzes, craft questions that require critical thinking, analysis, or interpretation of nuanced information. The incorrect options should be particularly convincing, making the quiz genuinely challenging at this level.

When generating these question-answer pairs, follow this format:

```
{{
"question": "$YOUR_QUESTION_HERE",
"answer": {{"correct_answer": "$THE_CORRECT_ANSWER_HERE", "wrong_answers": ["$THE_FIRST_WRONG_ANSWER_HERE","$THE_SECOND_WRONG_ANSWER_HERE","$THE_THIRD_WRONG_ANSWER_HERE"]}}
}}
```

Everything between the ``` must be valid JSON, make sure there are no whitespaces or special characters in the output so that it can be parsed to JSON!

Here is an example of how to extract question/answers from a given text:
> Given Text:
"Albert Einstein developed the theory of relativity, one of the two pillars of modern physics. His work also laid the foundation for the development of quantum mechanics."
> Generated question and answers:
"question": "Who developed the theory of relativity?",
"correct_answer": "Albert Einstein",
"wrong_answers": ["Isaac Newton","Niels Bohr","Marie Curie"]
> Additionally, the question/answers should be formated to JSON format as stated above.

You still have to parse the question and answers into a valid JSON as provided above!

"""
# Human-message template for quiz generation. Placeholders: difficulty,
# language, num_attempt and text (the transcript chunk to generate from).
templ2 = """Please come up with question/answers pair in JSON format from the following given text with the conditions that:

1. The quiz questions should be of difficulty **{difficulty}**.
2. The quiz questions should be in the same language as the text, in this case in **{language}**.
3. The following text is only a transcript of a YouTube video, refer to it as 'video' not as 'text'.
4. Please extract AT LEAST ONE adquate quiz question from the text, remember to provide it in the mentioned JSON format!

This is your attempt number {num_attempt} in generating a quiz question of this text. If you already tried more than once it means you failed to properly retrieve a quiz question in the necessary JSON format. If that is the case, please vehemently try to parse the quiz question in the given JSON format!

Here is the text:
----------------
{text}"""


def get_qa_prompt(difficulty: str, language: str, num_attempt: int):
    """Assemble the chat prompt used for quiz-question generation.

    The system message comes from ``templ1``. The human message comes from
    ``templ2`` with ``difficulty``, ``language`` and ``num_attempt`` pre-filled
    as partial variables, which leaves ``text`` as the only open placeholder.

    Args:
        difficulty: Difficulty label inserted into the human message.
        language: Language code inserted into the human message.
        num_attempt: Attempt counter inserted into the human message.

    Returns:
        The ``ChatPromptTemplate`` built from the system and human messages.
    """
    system_message = SystemMessagePromptTemplate.from_template(templ1)

    prefilled = {
        "difficulty": difficulty,
        "language": language,
        "num_attempt": num_attempt,
    }
    human_message = HumanMessagePromptTemplate.from_template(
        templ2,
        partial_variables=prefilled,
    )

    return ChatPromptTemplate.from_messages([system_message, human_message])


# Render the prompt with dummy text to inspect the final wording.
get_qa_prompt(difficulty="HARD", language="ES", num_attempt=1).format(text="mytext")

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain_openai import ChatOpenAI
from langchain.chains.llm import LLMChain

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

from langchain_core.prompts import PromptTemplate

# Single-placeholder summarisation prompt.
prompt_template = """Write a concise summary of the following text and in the same language:
    "{text}"
CONCISE SUMMARY:"""

prompt = PromptTemplate.from_template(prompt_template)

# NOTE: this rebinds `llm_chain`, which an earlier cell uses for the grader
# chain. Re-run that cell before grading again.
llm_chain = LLMChain(
    llm=ChatOpenAI(
        temperature=0,
        model="gpt-4o-mini",
    ),
    prompt=prompt,
)

# Pass the transcript text, not the Document object. Formatting a Document
# into the prompt inserts its string form (page_content='...') instead of
# the plain transcript.
llm_chain.run(doc.page_content)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain.chains.qa_generation.base import QAGenerationChain

import itertools
import uuid
import sys
from pathlib import Path
    
# Extend the import search path with the parent directories and the current
# working directory (presumably so project packages can be imported -- confirm).
# sys.path entries must be strings: a Path object is ignored by the import
# system, so the working directory is converted with str(). The membership
# check keeps re-runs of this cell from piling up duplicate entries.
for _extra_path in ("../", "../../", str(Path.cwd())):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

from json import JSONDecodeError

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import os

# Postgres connection settings exported through the process environment.
# NOTE(review): these overwrite anything loaded from .env above and hard-code
# a password -- confirm this is only meant for local development.
os.environ.update(
    {
        "POSTGRES_DRIVER": "psycopg",
        "POSTGRES_HOST": "localhost",
        "POSTGRES_PORT": "5432",
        "POSTGRES_DATABASE": "quizstream_db",
        "POSTGRES_USER": "admin",
        "POSTGRES_PASSWORD": "my_password",
    }
)

def chunk_transcript(
    doc: Document, num_chunks: int = 7, chunk_overlap: int = 50
) -> list[Document]:
    """Split a transcript into overlapping chunks annotated with their position.

    The target chunk size is ``len(doc.page_content) // num_chunks`` characters.
    Each chunk's metadata receives ``start_index`` (set by the splitter) and
    ``end_index`` (``start_index`` plus the chunk length).

    Args:
        doc: Document whose ``page_content`` is split.
        num_chunks: Divisor used to derive the chunk size. Defaults to 7, the
            value that used to be hard-coded.
        chunk_overlap: Requested overlap between chunks in characters. Defaults
            to 50, the value that used to be hard-coded.

    Returns:
        The list of chunk documents; empty when the transcript has no text.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be >= 1, got {num_chunks}")

    text_length = len(doc.page_content)
    if text_length == 0:
        # Nothing to split; the splitter would reject a chunk size of 0.
        return []

    chunk_size = max(text_length // num_chunks, 1)
    if chunk_overlap > chunk_size:
        # Short transcripts: the splitter raises when the overlap exceeds the
        # chunk size, so shrink the overlap instead of failing.
        chunk_overlap = chunk_size // 2

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents([doc])

    # The splitter records only where a chunk starts; add where it ends.
    for chunk in chunks:
        chunk.metadata["end_index"] = chunk.metadata["start_index"] + len(
            chunk.page_content
        )

    return chunks

def get_qa_from_chunk(
    chunk: Document,
    qa_generator_chain: QAGenerationChain,
) -> list[dict]:
    """Generate question/answer pairs for one chunk and tag them with metadata.

    Every returned pair gets a ``"metadata"`` entry holding a copy of the
    chunk's metadata plus a fresh UUID under ``"id"`` and the chunk text under
    ``"context"``.

    Args:
        chunk: Transcript chunk to generate questions from.
        qa_generator_chain: Chain whose ``run`` produces the pairs for a text.

    Returns:
        The generated pairs, or ``[-1]`` when a ``JSONDecodeError`` is raised.
        NOTE(review): the ``-1`` sentinel does not match the ``list[dict]``
        annotation; callers have to filter it out.
    """
    try:
        generated = qa_generator_chain.run(chunk.page_content)

        for item in generated:
            # Copy first so the chunk's own metadata dict is never mutated.
            item_metadata = dict(**chunk.metadata)
            item["metadata"] = item_metadata
            item_metadata.update(
                {"id": str(uuid.uuid4()), "context": chunk.page_content}
            )

        return generated
    except JSONDecodeError:
        return [-1]

def generate_qa_from_transcript(
    transcript: Document, difficulty: str = "HARD", language: str = "EN"
) -> list[dict[str, str]]:
    """Generate quiz question/answer pairs for a whole transcript.

    The transcript is chunked with ``chunk_transcript``, each chunk is passed
    through a ``QAGenerationChain`` (gpt-4o-mini, temperature 0) and the
    per-chunk results are flattened into one list.

    Args:
        transcript: Document holding the transcript text.
        difficulty: Difficulty label for the generation prompt. Defaults to
            ``"HARD"``, the value that used to be hard-coded.
        language: Language code for the generation prompt. Defaults to
            ``"EN"``, the value that used to be hard-coded.

    Returns:
        All generated question/answer dicts. Chunks whose generation failed
        contribute nothing.
    """
    chunks = chunk_transcript(transcript)

    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")
    qa_chain = QAGenerationChain.from_llm(
        llm,
        prompt=get_qa_prompt(difficulty=difficulty, language=language, num_attempt=1),
    )

    flattened = itertools.chain.from_iterable(
        get_qa_from_chunk(chunk, qa_chain) for chunk in chunks
    )

    # A failed chunk is reported as the sentinel -1. Keep only real pairs so
    # the result matches the declared return type and callers can index every
    # element as a dict.
    return [qa_pair for qa_pair in flattened if isinstance(qa_pair, dict)]

# Generate question/answer pairs from the first loaded transcript document.
data = generate_qa_from_transcript(transcript[0])

In [None]:
# Notebook-level model and QA chain, built the same way as inside
# generate_qa_from_transcript; the following cells experiment on its prompt.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
qa_chain = QAGenerationChain.from_llm(
    llm,
    prompt=get_qa_prompt(difficulty="HARD", language="EN", num_attempt=1),
)


In [None]:
# Replace the chain's prompt with one built for attempt number 2.
qa_chain.llm_chain.prompt = get_qa_prompt(difficulty="HARD", language="EN", num_attempt=2)

In [None]:
# Alternative to rebuilding the prompt: change the attempt number in place.
# messages[1] is the human message (get_qa_prompt builds [system, human]).
qa_chain.llm_chain.prompt.messages[1].prompt.partial_variables.update({"num_attempt": 10})

In [None]:
# Render the chain's current prompt to check which attempt number it contains.
qa_chain.llm_chain.prompt.format(text="hi")

In [None]:
from backend.quiz_generation.generator import agenerate_quiz

import os
import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

col_metadata, qa_ids = await agenerate_quiz("my_xyz_quiz",
                                YT_URL,
                                {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")})

In [None]:
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# Local Postgres target for the vector store.
# See docker command above to launch a postgres instance with pgvector enabled.
collection_name = "my_docs"
connection_string = "postgresql+psycopg://postgres:pwd@localhost:5432/quizzes"  # Uses psycopg3!

import datetime as dt

from langchain_core.embeddings import FakeEmbeddings

# Placeholder embeddings of size 1.
embeddings = FakeEmbeddings(size=1)

vector_store = PGVector(
    collection_name=collection_name,
    connection=connection_string,
    embeddings=embeddings,
    use_jsonb=True,
    # Collection-level metadata: creation timestamp (UTC) plus two sample stats.
    collection_metadata={
        "date_created": dt.datetime.now(dt.UTC).strftime('%Y-%m-%dT%H:%M:%SZ'),
        'num_tries': 1,
        'acc': 0.2,
    },
)

In [None]:
from sqlalchemy import create_engine, MetaData, Table, select

# Names of the database tables reflected by the query helpers below.
# NOTE(review): presumably the tables managed by PGVector -- confirm.
TABLE_COLLECTION = "langchain_pg_collection"
TABLE_DOCS = "langchain_pg_embedding"

def list_collections() -> list[str]:
    """Return the ``name`` of every row in the collection table.

    A short-lived engine is created from the notebook-level
    ``connection_string`` and disposed again before returning, so repeated
    calls do not leave connection pools behind.

    Returns:
        The collection names as plain values (not one-element rows), matching
        the declared ``list[str]`` return type.
    """
    engine = create_engine(connection_string)
    try:
        # Reflect the table definition from the database.
        table = Table(TABLE_COLLECTION, MetaData(), autoload_with=engine)

        query = select(table.c["name"])
        with engine.connect() as connection:
            # scalars() unwraps the single selected column of each row.
            return list(connection.execute(query).scalars().all())
    finally:
        engine.dispose()

def get_by_ids(ids: list[str]) -> list:
    """Fetch the full rows of the documents table whose ``id`` is in ``ids``.

    A short-lived engine is created from the notebook-level
    ``connection_string`` and disposed again before returning.

    Args:
        ids: Document ids to look up.

    Returns:
        The matching result rows (whole table rows, not strings); an empty
        list when ``ids`` is empty.
    """
    if not ids:
        # Nothing to look up, so skip the database round trip.
        return []

    engine = create_engine(connection_string)
    try:
        # Reflect the table definition from the database.
        table = Table(TABLE_DOCS, MetaData(), autoload_with=engine)

        query = select(table).where(table.c.id.in_(ids))
        with engine.connect() as connection:
            return connection.execute(query).fetchall()
    finally:
        engine.dispose()

def get_all_by_collection_id(engine: "Engine", collection_id: str):
    """Fetch every row of the documents table that belongs to one collection.

    Args:
        engine: SQLAlchemy engine used to reflect the table and run the query.
            The annotation is a string because ``Engine`` is not imported in
            this notebook; an unquoted annotation raised ``NameError`` as soon
            as the function was defined.
        collection_id: Value matched against the table's ``collection_id``
            column.

    Returns:
        The matching result rows.
    """
    # Reflect the table definition from the database.
    table = Table(TABLE_DOCS, MetaData(), autoload_with=engine)

    query = select(table).where(table.c.collection_id == collection_id)
    with engine.connect() as connection:
        results = connection.execute(query).fetchall()

    return results

# get_all_by_collection_id needs an explicit engine, and no notebook-level
# `engine` exists (the other helpers create their own), so build one here.
engine = create_engine(connection_string)

list_collections(), get_by_ids(["aad30221-cdc1-4add-86db-f4067ee9d8c7"]), get_all_by_collection_id(engine, "994272b7-354d-4c98-9c92-8d4b23b60e62")