In [7]:
# TODO
    # Hard condition on count of videos (20 max) ✅
    # Compressed String -> GPT (find source)
    # Increase number of rows in test data to 3000 ✅

# Updates
    # Convert generate_prompt from Function() to simple f"string"
    # Max number_of_videos => 20 enforced
        # Instructed GPT specifically to make it less than 20
        # If 20 is reached, then the prompt is re-run (while statement)

# NOTES
    # Used: https://platform.openai.com/tokenizer
    # Token Costs (4-o)
        #extract_details() -> 137 Tokens (706 chars)  @  Jun 20 2024
        #generate_curriculum() -> 70 Tokens (353 chars)  @  Jun 20 2024

In [5]:
# CONSTANTS

import os
import re
from pprint import pprint

import chromadb
import openai
import pandas as pd
from chromadb.utils import embedding_functions

# AI-Related Constants
# The key is read from the environment so it never has to be pasted into (and
# committed with) the notebook. The fallback keeps the previous empty-string value.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_EmbeddingModel = "text-embedding-3-small"

# ChromaDB
# Name of the collection and a persistent client storing its data under ./db
chromadb_name = "Edgur_Video_DB_Vectorstore_SEQUENTIAL"
chroma_client = chromadb.PersistentClient(path="db")

# DB
# Published Google Sheet exported as CSV; loaded once into a DataFrame.
db_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSAE2tBAnAdXsxk9a9YClFN7MSEVhzEmJD01ewwtooMLxL-Ilod26EbdD8sZeZk0ybiqD-jqT-9RZbn/pub?gid=1883452605&single=true&output=csv"  # test spreadsheet
df = pd.read_csv(db_url)

In [12]:

def extract_details(user_input):
    """
    Ask the OpenAI GPT-4o chat model to extract playlist details from a user query.

    The model is instructed to answer with five values in a fixed order
    (topic, difficulty, number of videos, playlist name, playlist description)
    joined by tildes. The reply is returned as-is (whitespace-trimmed); it is
    not validated here.

    Parameters:
    user_input (str): The user's query about a potential educational playlist.

    Returns:
    str: The model's reply, expected to hold the extracted details separated by tildes (~).
    """
    # Role description and output format. Each fragment ends with a space so the
    # concatenated sentences do not run into each other.
    role_prompt = (
        "You are an education-focused assistant. Your task is to extract key details from user queries "
        "to help them create effective educational playlists. "
        "Your response should be a single string with each value separated by a tilde (~). "
        "If no input is given, make an intelligent, educated guess on what the values should be."
    )
    # The fields to extract, in the order the caller will unpack them.
    fields_prompt = (
        "Please extract and format the following details in the specified order: "
        "1. Detailed topic (the more detailed, the better), "
        "2. Difficulty level (beginner, intermediate, advanced), "
        "3. Number of videos needed to form a playlist (integers only, less than 20), "
        "4. Name of the playlist, "
        "5. A concise description of the playlist."
    )
    # Reminder of the exact output layout, sent after the user's query.
    format_prompt = (
        "Extracted details should be formatted as: "
        "Detailed Topic ~ Difficulty Level ~ Number of Videos ~ Playlist Name ~ Playlist Description"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": role_prompt},
            {"role": "system", "content": fields_prompt},
            {"role": "user", "content": f"User query: '{user_input}'"},
            {"role": "user", "content": format_prompt},
        ],
    )
    details = response.choices[0].message["content"].strip()
    return details


def split_details(details):
    """
    Split a tilde-separated details string into its five components.

    Only the first four tildes act as separators, so a tilde that appears inside
    the final free-text description (e.g. "5~10 minutes") is kept as part of it
    instead of breaking the unpacking.

    Parameters:
    details (str): A string containing the details, separated by tildes (~).

    Returns:
    tuple: A tuple containing the topic, difficulty level, number of videos, playlist title, and playlist description,
    each stripped of surrounding whitespace. All values are strings.

    Raises:
    ValueError: If the string holds fewer than five tilde-separated fields.
    """
    # maxsplit=4 yields at most five parts; the last part absorbs any extra tildes.
    fields = [field.strip() for field in details.split("~", 4)]
    if len(fields) != 5:
        raise ValueError(
            f"Expected 5 '~'-separated fields but got {len(fields)}: {details!r}"
        )
    topic, difficulty, number_of_videos, playlist_title, playlist_desc = fields
    return (
        topic,
        difficulty,
        number_of_videos,
        playlist_title,
        playlist_desc,
    )

In [13]:
def generate_curriculum(topic, difficulty, number_of_videos):
    """
    Ask the OpenAI GPT-4 chat model for a list of video titles forming a curriculum.

    Parameters:
    topic (str): The subject of the curriculum.
    difficulty (str): The difficulty level of the curriculum.
    number_of_videos (int): The number of videos the curriculum should contain.

    Returns:
    str: The model's reply with surrounding whitespace removed; the model is
    instructed to put one video title on each line.
    """
    # Instructions that fix the persona and the shape of the reply.
    persona_message = {
        "role": "system",
        "content": "You are an experienced educational computer scientist now teaching others.",
    }
    output_rules_message = {
        "role": "system",
        "content": (
            "Return only a list of video titles, each title on a new line, with no sublists or additional text. "
            "Ensure the number of video titles matches exactly the number specified in the prompt. "
            "Each title should be detailed enough for easy querying in a vector store database."
        ),
    }
    # The actual request, parameterised by the caller's arguments.
    request_message = {
        "role": "user",
        "content": (
            f"Create a short video curriculum on '{topic}' with a(n) {difficulty} difficulty level, "
            f"designed to be covered in exactly {number_of_videos} videos."
        ),
    }

    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[persona_message, output_rules_message, request_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message['content'].strip()

In [27]:
def generate_curriculum_based_on_query(
    prompt: str, playlist_details: bool = False, debug: bool = False
) -> list:
    """
    Generate a curriculum (a list of video titles) from a free-text query.

    The query is passed to extract_details(), the reply is split into its five
    fields with split_details(), and the topic, difficulty and video count are
    then handed to generate_curriculum().

    Args:
        prompt (str): The search query.
        playlist_details (bool, optional): If True, prints the playlist title and description. Defaults to False.
        debug (bool, optional): If True, prints debug information. Defaults to False.

    Returns:
        list: The generated video titles, one string per non-empty line of the
        curriculum text.

    Raises:
        ValueError: If no integer can be found in the extracted number of
        videos, or if that number is greater than 20.
    """

    # Extract details from the prompt
    details = extract_details(prompt)
    topic, difficulty, number_of_videos, playlist_title, playlist_desc = split_details(
        details
    )

    # Convert number_of_videos to integer.
    # Handles GPT hallucinations: the field may contain extra words
    # (e.g. "10 videos"), so the first run of digits is used.
    count_match = re.search(r"\d+", number_of_videos)
    if count_match is None:
        raise ValueError(
            f"Could not find a number of videos in extracted details: {number_of_videos!r}"
        )
    number_of_videos = int(count_match.group())

    # Hard limit: reject the request outright when more than 20 videos are asked for.
    if number_of_videos > 20:
        raise ValueError("Max number of videos cannot be more than 20")

    # Generate the curriculum
    curriculum = generate_curriculum(topic, difficulty, number_of_videos)

    # Split the curriculum into a list of video titles for semantic search
    curriculum_videos = [
        video.strip() for video in curriculum.split("\n") if video.strip()
    ]

    # Print playlist title/desc if requested
    if playlist_details:
        print(f"\n{playlist_title}\n{playlist_desc}\n")

    # Print debug information if requested
    if debug:
        debug_info = {
            "Topic": topic,
            "Difficulty": difficulty,
            "Number of Videos": number_of_videos,
            "Playlist Title": playlist_title,
            "Playlist Description": playlist_desc,
        }

        print("\nPrinting debug...\n")
        pprint(debug_info)
        print("\nPrinting generated curriculum...\n")
        pprint(curriculum_videos)

    return curriculum_videos

In [28]:
# Example usage:
# Runs the whole query -> details -> curriculum pipeline for a sample query.
# This triggers live OpenAI chat requests; with both flags on, the playlist
# title/description and the debug details are printed. `curriculum` ends up
# holding the list of generated video titles.
curriculum = generate_curriculum_based_on_query("I want to learn about neural networks", playlist_details=True, debug=True)


Printing debug...

{'Difficulty': 'Beginner',
 'Number of Videos': 10,
 'Playlist Description': 'This playlist covers the basic concepts and '
                         'fundamentals of neural networks for beginners.',
 'Playlist Title': 'Introduction to Neural Networks',
 'Topic': 'Neural Networks'}

Printing generated curriculum...

['1. Introduction to Neural Networks: Understanding the Basics',
 '2. Building Blocks of Neural Networks: Neurons and Synapses',
 '3. Layers in Neural Networks: Hidden, Input and Output Explained',
 '4. Introduction to Activation Functions in Neural Networks',
 '5. Understanding Feedforward and Backpropagation in Neural Networks',
 '6. Techniques for Training Neural Networks: Part 1',
 '7. Techniques for Training Neural Networks: Part 2',
 '8. Practical Applications of Neural Networks in Everyday Life',
 "9. Getting Started with Neural Networks in Python: A Beginner's Guide",
 "10. Review and Next Steps: Wrap-Up of Beginner's Neural Networks Course"]

Int

____

In [21]:
def initialize_chromadb():
    """
    Fetch (or create on first use) the ChromaDB collection used by this notebook.

    The collection is named by `chromadb_name`, embeds documents through
    OpenAI using `OPENAI_EmbeddingModel`, and is configured with cosine
    distance ("hnsw:space": "cosine").

    Returns:
    collection: The ChromaDB collection obtained from `chroma_client`.
    """
    # Embedding function that ChromaDB calls to vectorise documents and queries.
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        model_name=OPENAI_EmbeddingModel,
        api_key=OPENAI_API_KEY,
    )

    return chroma_client.get_or_create_collection(
        name=chromadb_name,
        metadata={"hnsw:space": "cosine"},
        embedding_function=embedder,
    )


def add_or_update_chromadb_rows(df, collection):
    """
    Upsert one document per DataFrame row into the ChromaDB collection.

    Each document is the row's "tags" string with every "," replaced by ", ".
    Ids are the 1-based row positions rendered as strings ("1", "2", ...).

    Parameters:
    df (DataFrame): DataFrame containing video data; must have a "tags" column of strings.
    collection: ChromaDB collection (anything exposing `upsert(documents=..., ids=...)`).
    """
    # NOTE(review): ids come from row position, not from a column of `df`;
    # confirm this matches however callers map ids back to rows.
    tag_docs = [", ".join(raw_tags.split(",")) for raw_tags in df["tags"]]
    row_ids = [str(position) for position in range(1, len(tag_docs) + 1)]
    collection.upsert(documents=tag_docs, ids=row_ids)


In [22]:
# Get (or create) the collection, then upsert one tags document per row of `df`.
# `collection` is read as a module-level name by retrieve_videos().
collection = initialize_chromadb()
add_or_update_chromadb_rows(df, collection)

In [23]:
# Pick one not-yet-watched video per curriculum item via vector search
def retrieve_videos(curriculum, n_results=10):
    """
    Build a video sequence by querying the collection once per curriculum item.

    For each item, the query results are scanned in the order returned and the
    first id not already in the module-level `watched_videos` set is taken: it
    is added to `watched_videos` and its title is looked up in `df` by
    "video_id". An item whose results are all already watched contributes
    nothing to the sequence.

    Parameters:
    curriculum (iterable of str): Query texts, one per curriculum item.
    n_results (int): Number of results requested from the collection per item.

    Returns:
    list: Dicts of the form {"id": <int>, "title": <video_title value>}.
    """
    global watched_videos
    playlist = []

    for query in curriculum:
        hits = collection.query(query_texts=query, n_results=n_results)

        # Ids come back as strings; convert lazily while scanning.
        candidate_ids = (int(raw_id) for raw_id in hits["ids"][0])
        for video_id in candidate_ids:
            if video_id in watched_videos:
                continue

            # Mark as watched before the title lookup, then stop at this hit.
            watched_videos.add(video_id)
            matching_titles = df.loc[df["video_id"] == video_id, "video_title"]
            playlist.append({"id": video_id, "title": matching_titles.values[0]})
            break

    return playlist

In [24]:
# Initialize a set to store watched video ids.
# retrieve_videos() reads and adds to this set, so ids persist across calls.
# To empty it (start with no watched videos), run this cell again.
watched_videos = set()

In [26]:
# Example usage:
# Generate a curriculum for a sample query, then fetch one video per curriculum
# item. n_results is set to the number of curriculum items. The returned ids
# are also added to `watched_videos` as a side effect.
curriculum = generate_curriculum_based_on_query("cybersecurity for beginner short time", playlist_details=True, debug=True)
retrieve_videos(curriculum, len(curriculum))

{'Difficulty': 'Beginner',
 'Number of Videos': 10,
 'Playlist Description': 'A concise playlist designed for beginners to quickly '
                         'grasp the fundamentals of cybersecurity.',
 'Playlist Title': 'Introduction to Cybersecurity',
 'Topic': 'Cybersecurity Fundamentals'}

Introduction to Cybersecurity
A concise playlist designed for beginners to quickly grasp the fundamentals of cybersecurity.


[{'id': 65, 'title': 'Introduction to Cybersecurity'},
 {'id': 110, 'title': 'Network Security'},
 {'id': 66, 'title': 'Understanding Cyber Threats'},
 {'id': 112, 'title': 'Secure Coding Practices'},
 {'id': 107, 'title': 'Introduction to Cybersecurity'},
 {'id': 72, 'title': 'Security Policies and Compliance'},
 {'id': 70, 'title': 'Secure Coding Practices'},
 {'id': 120, 'title': 'Firewalls and Intrusion Detection Systems (IDS)'},
 {'id': 68, 'title': 'Network Security'},
 {'id': 136, 'title': 'Future Trends in Cybersecurity'}]