In [16]:
# video_recommender/chromadb_helper.py

import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
from constants import *

# video_recommender/constants.py
import os
from dotenv import load_dotenv
import pandas as pd

# Load variables from a local .env file into the process environment so the
# os.getenv() lookup further down can see them.
load_dotenv()  # take environment variables from .env.

# Recommender tuning values.
# NOTE(review): the units and exact meaning of these numbers are not defined
# in this cell — confirm against the code that consumes them.
NEW_VIDEO_WEIGHT = 10
STARTING_WEIGHT = 10
USER_MIN_INTEREST = 4
USER_MAX_INTEREST = 8
COUNT_QUEUED_VIDEOS = 5


# User Actions
# Presumably the amount each action contributes to an interest score — TODO confirm.
LIKE = 1
SHARE = 2
WATCH = 1  # more than 50% of total duration
LOOP = 1

# AI-Related Constants
# OPENAI_API_KEY is None when the variable is absent from the environment/.env.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_EmbeddingModel = "text-embedding-3-small"

# ChromaDB
# Collection name used by initialize_chromadb().
chromadb_name = "Edgur_Video_DB_Vectorstore"

# DB
# The CSV is downloaded every time this cell runs; `df` is the frame later
# passed to add_or_update_chromadb_rows().
db_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSAE2tBAnAdXsxk9a9YClFN7MSEVhzEmJD01ewwtooMLxL-Ilod26EbdD8sZeZk0ybiqD-jqT-9RZbn/pub?gid=497214901&single=true&output=csv"  # test spreadsheet
df = pd.read_csv(db_url)

In [None]:

def initialize_chromadb():
    """
    Open the on-disk ChromaDB store and return the video collection.

    The collection is looked up by ``chromadb_name`` and created if it does
    not exist yet. It is configured with an OpenAI embedding function
    (model ``OPENAI_EmbeddingModel``) and cosine distance.

    Returns:
    collection: The existing or newly created ChromaDB collection.
    """
    # Persistent client: data lives under the relative "db" directory.
    client = chromadb.PersistentClient(path="db")

    embedder_options = {
        "api_key": OPENAI_API_KEY,
        "model_name": OPENAI_EmbeddingModel,
    }
    embedder = embedding_functions.OpenAIEmbeddingFunction(**embedder_options)

    return client.get_or_create_collection(
        name=chromadb_name,
        embedding_function=embedder,
        metadata={"hnsw:space": "cosine"},
    )


def add_or_update_chromadb_rows(df, collection):
    """
    Add or update rows in the ChromaDB collection.

    Each row's comma-separated ``tags`` value becomes one document of the
    form ``"tag1, tag2, ..."``. The document id is the row's 1-based position
    in ``df`` (as a string), so re-running with the same frame overwrites the
    same records.

    Rows are skipped (their position is still counted, so the ids of the
    remaining rows do not shift) when the ``tags`` value is not a string
    (e.g. NaN from an empty spreadsheet cell) or contains no non-blank tag.
    Nothing is sent to the collection when no row yields a document.

    Parameters:
    df (DataFrame): DataFrame containing video data; must have a "tags" column.
    collection: ChromaDB collection (any object exposing ``upsert``).
    """
    ids = []
    documents = []
    for position, raw_tags in enumerate(df["tags"].tolist(), start=1):
        # Missing cells arrive as float NaN / None and have no .split().
        if not isinstance(raw_tags, str):
            continue

        # Strip each tag so "a,b" and "a, b" normalise to the same document,
        # and drop empty fragments produced by stray commas.
        tags = [tag.strip() for tag in raw_tags.split(",")]
        tags = [tag for tag in tags if tag]
        if not tags:
            continue

        ids.append(str(position))
        documents.append(", ".join(tags))

    # An upsert with empty id/document lists is rejected by the collection.
    if not ids:
        return

    collection.upsert(documents=documents, ids=ids)


In [None]:
# Open (or create) the persistent collection, then upsert the tag documents
# built from the spreadsheet frame `df`.
collection = initialize_chromadb()
add_or_update_chromadb_rows(df, collection)

In [14]:
import openai
import chromadb
from chromadb.config import Settings

# Initialize ChromaDB
# Client built from default Settings.
# NOTE(review): presumably an in-memory client that is separate from the
# PersistentClient opened in initialize_chromadb() — confirm this is intended.
chroma_client = chromadb.Client(Settings())

# Define template for the system message
# Sent as the "system" role in generate_curriculum(); it restricts the reply
# to bullet points only.
system_message_template = """
You are a seasoned developer and computer scientist turned educator, specializing in creating structured learning paths for students of various levels. You have extensive experience in designing curriculums that progress from beginner to intermediate to advanced topics strictly in bullet points. You do not say anything else apart from said bullet points.
"""

# Define template for the user message
# The {topic} placeholder is filled in by generate_curriculum() via str.format().
user_message_template = "Generate a 10-item-only (no subtopics) curriculum for learning {topic} that progresses from beginner to intermediate to advanced topics."

# Define function to generate curriculum
def generate_curriculum(topic):
    """
    Ask the chat model for a learning curriculum and return it line by line.

    Parameters:
    topic (str): Subject to build the curriculum for; substituted into
        ``user_message_template``.

    Returns:
    list[str]: One entry per non-blank line of the model's reply, with
        surrounding whitespace removed. Bullet markers such as "- " are kept.
        An empty list is returned when the reply has no content.
    """
    user_message = user_message_template.format(topic=topic)
    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface; it is
    # not available in openai>=1.0 — confirm the pinned SDK version.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_message_template},
            {"role": "user", "content": user_message}
        ]
    )
    curriculum = response['choices'][0]['message']['content'] or ""

    # Models often separate bullets with blank lines or emit "\r\n"; strip each
    # line and drop the empty ones so every returned item is a usable query.
    stripped_lines = (line.strip() for line in curriculum.split('\n'))
    return [line for line in stripped_lines if line]

In [15]:
# Example call; the list shown below is the output captured from this cell.
generate_curriculum("MySQL")

['- Introduction to Databases and MySQL',
 '- Basic SQL Commands (SELECT, INSERT, UPDATE, DELETE)',
 '- Filtering and Sorting Data',
 '- Working with Functions and Aggregations',
 '- Joins and Unions',
 '- Indexing and Optimization',
 '- Transactions and Locking',
 '- Stored Procedures and Functions',
 '- Triggers and Events',
 '- Advanced Query Optimization and Performance Tuning']

In [None]:

# Load video data into ChromaDB (dummy data for example)
# Each record carries an "id", a "title" and an "embedding" list.
# NOTE(review): the `...` entries inside the lists are the literal Ellipsis
# object, not elided numbers, so these vectors are placeholders only —
# replace them with real embeddings (or drop the key) before storing them.
video_data = [
    {"id": "video1", "title": "Introduction to SQL", "embedding": [0.1, 0.2, ...]},
    {"id": "video2", "title": "Basic SQL Terms", "embedding": [0.3, 0.1, ...]},
    # ... more videos
]

# Add video data to ChromaDB
# A chromadb client has no add_document() method: records are written through
# a collection. upsert() keeps this cell safe to re-run with the same ids.
video_collection = chroma_client.get_or_create_collection(name="videos")

# upsert() rejects empty id lists, so only write when there is data.
if video_data:
    video_collection.upsert(
        ids=[video["id"] for video in video_data],
        # Titles are the searchable text. The "embedding" values are not
        # passed because the dummy vectors contain literal `...` placeholders;
        # the collection's embedding function embeds the titles instead.
        documents=[video["title"] for video in video_data],
    )


In [None]:

# Retrieve videos for each curriculum item
def retrieve_videos(curriculum, collection=None):
    """
    Find the best-matching video id for each curriculum item.

    Parameters:
    curriculum (list[str]): Curriculum items used as query texts.
    collection: Optional ChromaDB collection to search (any object exposing
        ``query(query_texts=..., n_results=...)``). When omitted, the
        "videos" collection of the module-level ``chroma_client`` is used.

    Returns:
    list[str]: Ids of the top match for each item, in curriculum order.
        Blank items and items with no match are skipped.
    """
    if collection is None:
        # chromadb clients expose no query() method; searches go through a
        # collection.
        collection = chroma_client.get_or_create_collection(name="videos")

    video_sequence = []
    for item in curriculum:
        # Blank lines in a model reply are not meaningful queries.
        if not item or not item.strip():
            continue

        results = collection.query(query_texts=[item], n_results=1)
        # "ids" holds one list per query text, ordered by relevance.
        matches = results["ids"][0] if results["ids"] else []
        if matches:
            video_sequence.append(matches[0])
    return video_sequence


In [None]:

# End-to-end run: generate a curriculum for one topic, look up videos for its
# items, then print both results under their headings.
topic = "SQL"
curriculum = generate_curriculum(topic)
video_sequence = retrieve_videos(curriculum)

print("Generated Curriculum:", curriculum, sep="\n")

print("Video Sequence:", video_sequence, sep="\n")
