In [2]:
from langchain_community.document_loaders import YoutubeLoader

# Video whose transcript is used by the cells below.
YT_URL = "https://www.youtube.com/watch?v=IFx8eABfivg"

# NOTE(review): `language` is presumably a priority-ordered list of accepted
# transcript languages and `translation` the target language — confirm against
# the YoutubeLoader documentation.
loader = YoutubeLoader.from_youtube_url(
    YT_URL,
    language=["en", "es", "de"],
    translation="de",
    add_video_info=True,
)

# Result of the loader; a later cell indexes it as `transcript[0]`.
transcript = loader.load()

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain_openai import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.docstore.document import Document
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

from langchain_core.prompts import PromptTemplate

# Prompt asking the model to restate an error message in non-technical language.
# `{text}` is the template's only input variable.
prompt_template = """
Write a concise and simple explanation of the following error message in a way that is easily understandable by someone without a technical background. 

If there was a problem with transcript retrieval of youtube video, add warning that app only supports given list of languages. 

Focus on the key issue and avoid technical terms:
    "{text}"
CONCISE SUMMARY (limit to one or two sentences):"""

prompt = PromptTemplate.from_template(prompt_template)

llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)

# Not used by the rest of this cell; kept so the name stays defined for any
# other cell that may rely on it.
llm_chain = LLMChain(llm=llm, prompt=prompt)

# NOTE(review): `msg2` is not defined in any visible cell of this notebook; it
# must hold the raw error message and be assigned before this cell is run.
doc = Document(page_content=msg2)

summarizing_chain = prompt | llm | StrOutputParser()

# Supply the message text under the template's `text` variable. Passing the
# Document object itself would make the template format the whole object
# rather than just the message text.
summarizing_chain.invoke({"text": doc.page_content})

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain.chains.qa_generation.base import QAGenerationChain

import itertools
import uuid
import sys
from pathlib import Path
    
# Make the parent, grandparent and current working directories importable.
# sys.path entries must be plain strings (other types are ignored by the import
# system), so the Path is converted with str(). The membership check keeps
# re-running this cell from piling up duplicate entries.
for extra_path in ("../", "../../", str(Path.cwd())):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from json import JSONDecodeError

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import os

# Postgres connection settings exported as environment variables.
# These assignments run after load_dotenv() above, so they replace any
# POSTGRES_* values that were read from the .env file.
# NOTE(review): the user and password are hardcoded here; acceptable only for a
# throwaway local database — prefer supplying them through the environment
# before sharing or committing this notebook.
os.environ["POSTGRES_DRIVER"] = "psycopg"
os.environ["POSTGRES_HOST"]="localhost"
os.environ["POSTGRES_PORT"]="5432"
os.environ["POSTGRES_DATABASE"] = "quizstream_db"
os.environ["POSTGRES_USER"] = "admin"
os.environ["POSTGRES_PASSWORD"] = "my_password"

def chunk_transcript(
    doc: Document, num_chunks: int = 7, chunk_overlap: int = 50
) -> list[Document]:
    """Split a transcript document into overlapping chunks.

    The chunk size is the transcript length divided by ``num_chunks``; the
    splitter decides how many chunks actually result. Every returned chunk
    carries ``start_index`` (set by the splitter) and ``end_index``
    (``start_index`` plus the chunk length) in its metadata.

    Args:
        doc: Document whose ``page_content`` holds the full transcript.
        num_chunks: Divisor applied to the transcript length to obtain the
            chunk size.
        chunk_overlap: Requested overlap between neighbouring chunks, passed to
            the splitter. It is reduced when the transcript is so short that
            the overlap would exceed the chunk size.

    Returns:
        The chunks produced by the splitter, each with ``end_index`` added.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")

    # For a short transcript the plain division yields a chunk size of zero or
    # one below the overlap; the splitter rejects both, so clamp them here.
    # Transcripts long enough for the original settings are unaffected.
    chunk_size = max(len(doc.page_content) // num_chunks, 1)
    if chunk_overlap > chunk_size:
        chunk_overlap = chunk_size // 2

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents([doc])

    # Record where each chunk ends within the original transcript.
    for chunk in chunks:
        chunk.metadata["end_index"] = chunk.metadata["start_index"] + len(
            chunk.page_content
        )

    return chunks

def get_qa_from_chunk(
    chunk: Document,
    qa_generator_chain: QAGenerationChain,
) -> list[dict]:
    """Generate question/answer pairs for a single transcript chunk.

    Args:
        chunk: Chunk whose ``page_content`` is sent to the chain and whose
            ``metadata`` is copied onto every generated pair.
        qa_generator_chain: Chain whose ``run`` method returns the QA pairs for
            a piece of text.

    Returns:
        The pairs returned by the chain, each extended with a ``metadata`` dict
        holding a copy of the chunk metadata, a fresh UUID under ``id`` and the
        chunk text under ``context``. An empty list is returned when the chain
        raises ``JSONDecodeError``, so the result only ever contains dicts.
    """
    import logging

    try:
        qa_pairs = qa_generator_chain.run(chunk.page_content)
    except JSONDecodeError:
        # Best effort: one unparsable chunk should not abort the whole run, but
        # it should not inject a non-dict placeholder into the results either.
        logging.getLogger(__name__).warning(
            "QA generation output could not be parsed as JSON; skipping chunk "
            "starting at index %s",
            chunk.metadata.get("start_index"),
        )
        return []

    # Attach a private copy of the chunk metadata so pairs do not share state.
    for qa_pair in qa_pairs:
        qa_pair["metadata"] = {
            **chunk.metadata,
            "id": str(uuid.uuid4()),
            "context": chunk.page_content,
        }

    return qa_pairs

def generate_qa_from_transcript(
    transcript: Document,
    difficulty: str = "HARD",
    language: str = "EN",
    model: str = "gpt-4o-mini",
) -> list[dict]:
    """Generate question/answer pairs for a whole transcript.

    The transcript is chunked with ``chunk_transcript``, each chunk is passed
    to ``get_qa_from_chunk``, and the per-chunk results are flattened into one
    list.

    Args:
        transcript: Document holding the full transcript text.
        difficulty: Value forwarded to ``get_qa_prompt`` as ``difficulty``.
        language: Value forwarded to ``get_qa_prompt`` as ``language``.
        model: Model name passed to ``ChatOpenAI``.

    Returns:
        A flat list of QA-pair dicts across all chunks.
    """
    chunks = chunk_transcript(transcript)

    llm = ChatOpenAI(temperature=0, model=model)
    # NOTE(review): `get_qa_prompt` is neither defined nor imported in the
    # visible cells of this notebook; it has to be in scope before this runs.
    qa_chain = QAGenerationChain.from_llm(
        llm,
        prompt=get_qa_prompt(difficulty=difficulty, language=language, num_attempt=1),
    )

    per_chunk_pairs = (get_qa_from_chunk(chunk, qa_chain) for chunk in chunks)

    # Keep only dict entries so that a failure placeholder coming back from
    # get_qa_from_chunk never reaches callers.
    return [
        qa_pair
        for qa_pair in itertools.chain.from_iterable(per_chunk_pairs)
        if isinstance(qa_pair, dict)
    ]

# Generate QA pairs from the first document returned by the YouTube loader
# (`transcript` is assigned from `loader.load()` in the first cell).
data = generate_qa_from_transcript(transcript[0])

In [None]:
from backend.quiz_generation.generator import agenerate_quiz

import os
import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

col_metadata, qa_ids = await agenerate_quiz("my_xyz_quiz",
                                YT_URL,
                                {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")})

In [None]:
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# Connection settings for a Postgres instance with pgvector enabled.
# NOTE(review): no docker command appears in the cells visible above — add or
# link the command used to launch the database.
# NOTE(review): the user, password and database are hardcoded here and differ
# from the POSTGRES_* environment values set in an earlier cell — confirm which
# database is intended and avoid committing real credentials.
connection_string = "postgresql+psycopg://postgres:pwd@localhost:5432/quizzes"  # Uses psycopg3!
collection_name = "my_docs"

import datetime as dt

from langchain_core.embeddings import FakeEmbeddings
# FakeEmbeddings with vector size 1 stands in for a real embedding model.
embeddings = FakeEmbeddings(size=1)

# Metadata attached to the collection; the creation time is taken in UTC and
# formatted with a trailing "Z".
created_at = dt.datetime.now(dt.UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
collection_metadata = {"date_created": created_at, "num_tries": 1, "acc": 0.2}

vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection_string,
    use_jsonb=True,
    collection_metadata=collection_metadata,
)

In [None]:
from sqlalchemy import create_engine, MetaData, Table, select
from sqlalchemy.engine import Engine

# Names of the database tables reflected by the helper functions below.
# NOTE(review): presumably the tables that langchain_postgres' PGVector
# creates — confirm against the installed version.
TABLE_COLLECTION = "langchain_pg_collection"
TABLE_DOCS = "langchain_pg_embedding"

def list_collections() -> list[str]:
    """Return the ``name`` column of every row in the collection table.

    Connects with the notebook-level ``connection_string`` and reflects
    ``TABLE_COLLECTION`` from the database.

    Returns:
        The collection names as plain strings.
    """
    engine = create_engine(connection_string)
    try:
        # Reflect the table definition from the live database.
        table = Table(TABLE_COLLECTION, MetaData(), autoload_with=engine)
        query = select(table.c["name"])

        with engine.connect() as connection:
            # scalars() unwraps the single selected column, so callers receive
            # strings (as annotated) instead of one-element rows.
            return list(connection.execute(query).scalars().all())
    finally:
        # The engine is created per call, so release its connection pool too.
        engine.dispose()

def get_by_ids(ids: list[str]) -> list:
    """Fetch the rows of the document table whose ``id`` is in ``ids``.

    Connects with the notebook-level ``connection_string`` and reflects
    ``TABLE_DOCS`` from the database.

    Args:
        ids: Values to match against the table's ``id`` column.

    Returns:
        The matching result rows with all table columns; an empty list when
        ``ids`` is empty.
    """
    # Nothing can match an empty id list, so skip the database round trip.
    if not ids:
        return []

    engine = create_engine(connection_string)
    try:
        # Reflect the table definition from the live database.
        table = Table(TABLE_DOCS, MetaData(), autoload_with=engine)
        query = select(table).where(table.c.id.in_(ids))

        with engine.connect() as connection:
            return connection.execute(query).fetchall()
    finally:
        # The engine is created per call, so release its connection pool too.
        engine.dispose()

def get_all_by_collection_id(engine: Engine, collection_id: str) -> list:
    """Fetch every row of the document table that belongs to one collection.

    Args:
        engine: SQLAlchemy engine used both to reflect ``TABLE_DOCS`` and to
            run the query; the caller owns it and it is not disposed here.
        collection_id: Value to match against the table's ``collection_id``
            column.

    Returns:
        The matching result rows with all table columns.
    """
    # Reflect the table definition from the live database.
    table = Table(TABLE_DOCS, MetaData(), autoload_with=engine)
    query = select(table).where(table.c.collection_id == collection_id)

    with engine.connect() as connection:
        return connection.execute(query).fetchall()

# get_all_by_collection_id needs an engine, and none exists at notebook scope
# (the other two helpers only create local ones), so build it here.
engine = create_engine(connection_string)

# The tuple is the cell's last expression, so all three results are displayed.
list_collections(), get_by_ids(["aad30221-cdc1-4add-86db-f4067ee9d8c7"]), get_all_by_collection_id(engine, "994272b7-354d-4c98-9c92-8d4b23b60e62")