___
# User Interest Algorithm


In [60]:
import os
import random
from pprint import pprint

import chromadb
import pandas as pd
from chromadb.utils import embedding_functions

# Constants

# NOTE(review): NEW_VIDEO_WEIGHT is not referenced by any visible cell — confirm it is still needed
NEW_VIDEO_WEIGHT = 10
# Weight assigned to every interest (and to "Random") when it is first added
STARTING_WEIGHT = 10

# Allowed number of interests a user may pick at sign-up (inclusive bounds)
USER_MIN_INTEREST = 4
USER_MAX_INTEREST = 8

# Number of vids to queue before running query again (to save costs)
# This value is what view_video() passes to add_to_queue() as n_results
COUNT_QUEUED_VIDEOS = 5

# User Actions (points added to an interest's weight per action)
LIKE = 1
SHARE = 2
WATCH = 1  # more than 50% of total duration
LOOP = 1   # awarded once per loop

# AI-Related Constants
# Read the key from the environment so it is never saved inside the notebook;
# falls back to an empty string (the previous placeholder) when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_EmbeddingModel = "text-embedding-3-small"

In [61]:

# Client for the vector store; presumably persists its data under the relative path "db" — confirm against the ChromaDB docs
chroma_client = chromadb.PersistentClient(path="db")

# Use OpenAI as the embedding model (word to vector)
# The key and model name come from the constants cell above.
OPENAI_EF = embedding_functions.OpenAIEmbeddingFunction(
                api_key=OPENAI_API_KEY,
                model_name=OPENAI_EmbeddingModel
            )

# Create a vectorstore database and use Cosine Similarity for semantic search within the DB
# get_or_create_collection is used so re-running this cell reuses the collection of the same name.
# `collection` is the module-level handle that the query/upsert functions below rely on.
collection = chroma_client.get_or_create_collection(
    name="Edgur_Video_DB_Vectorstore",
    embedding_function=OPENAI_EF,
    metadata={"hnsw:space": "cosine"},
)



In [62]:
# Sanity check: show the client's batch-size limit before bulk-upserting rows into the collection
print("Number of documents that can be inserted at once: ", chroma_client.max_batch_size)

Number of documents that can be inserted at once:  41666


In [63]:
def DebugPrint(msg, function_name):
    """
    Pretty-print a debug payload framed by START/END banner lines.

    Parameters:
    msg (object): The value to display; rendered with pprint.
    function_name (str): Label shown in both banners to identify the caller.
    """
    opening_banner = ("START ------- DEBUG (INFUNC):", function_name, "\n")
    closing_banner = ("\nEND ------- DEBUG (INFUNC):", function_name, "\n\n\n")

    print(*opening_banner)
    pprint(msg)
    print(*closing_banner)

In [64]:
def create_user_interest_dict(user_interest_list):
    """
    Build the initial interest-weight dictionary for a new user.

    Parameters:
    user_interest_list (list): Interest tags chosen by the user. A tag that
        appears more than once is counted (and stored) only once.

    Returns:
    dict: Every distinct interest mapped to STARTING_WEIGHT, plus a "Random"
        entry with the same weight.

    Raises:
    ValueError: If the number of distinct interests is below USER_MIN_INTEREST
        or above USER_MAX_INTEREST.
    """
    # Deduplicate (keeping first-seen order) before validating: the resulting
    # dict can only hold each tag once, so duplicates must not count toward
    # the minimum.
    unique_interests = list(dict.fromkeys(user_interest_list))

    if len(unique_interests) < USER_MIN_INTEREST:
        raise ValueError(f"You must have at least {USER_MIN_INTEREST} interests")
    if len(unique_interests) > USER_MAX_INTEREST:
        raise ValueError(f"You may have {USER_MAX_INTEREST} interests at most")

    user_dict_interest = dict.fromkeys(unique_interests, STARTING_WEIGHT)
    # "Random" is a reserved entry that get_interest_tag_for_recommendation
    # treats as "pick some other tag at random".
    user_dict_interest["Random"] = STARTING_WEIGHT

    DebugPrint(user_dict_interest, "create_user_interest_dict")

    return user_dict_interest

In [65]:
def observe_action_taken(
    interest_tag, liked=False, shared=False, watched=False, loop_count=0
):
    """
    Convert the actions a user took on one video into a point total.

    Parameters:
    interest_tag (str): The tag of the video (not used in the calculation).
    liked (bool): Whether the user liked the video.
    shared (bool): Whether the user shared the video.
    watched (bool): Whether the user watched the video.
    loop_count (int): The number of times the user looped the video.

    Returns:
    The sum of LIKE, SHARE and WATCH for each action that occurred, plus
    LOOP for every loop.
    """
    # Pair each yes/no action with the points it is worth
    flag_points = ((liked, LIKE), (shared, SHARE), (watched, WATCH))
    one_off_points = sum(points for happened, points in flag_points if happened)

    # Loops are counted, so they scale with loop_count
    return one_off_points + loop_count * LOOP

In [66]:
# observe_action_taken(chosen_interest_tag, liked=True, shared=True, watched=True, loop_count=1)

In [67]:
def recalculate_percentages(user_dict_interest):
    """
    Recalculate the percentage of each interest relative to the total weight.

    Parameters:
    user_dict_interest (dict): Interest tag -> numeric weight.

    Returns:
    dict: Interest tag -> share of the total weight as a percentage, rounded
        to 2 decimals. When the weights sum to zero there is nothing to divide
        by, so every tag maps to 0.0.
    """
    total_weight = sum(user_dict_interest.values())

    if total_weight == 0:
        # Avoid ZeroDivisionError when every weight is zero
        user_dict_percentage = {interest_tag: 0.0 for interest_tag in user_dict_interest}
    else:
        # Because each value is rounded, the percentages may not sum to exactly 100
        user_dict_percentage = {
            interest_tag: round((weight / total_weight) * 100, 2)
            for interest_tag, weight in user_dict_interest.items()
        }

    DebugPrint(user_dict_percentage, "recalculate_percentages (%)")
    return user_dict_percentage

In [68]:
# recalculate_percentages(create_user_interest_dict(user1_interests))

In [69]:
def handle_interruptions(user_dict_percentage):
    """
    Handle intentional interruptions and suggest relevant interests (50% dominance) or recommend more of the same interest.

    The interest with the highest percentage is inspected:
    - above 75%: the user is asked (via input()) whether they are enjoying it;
      a "yes" answer prints the recommend-more message.
    - above 50%: a notice that relevant interests are being suggested is printed.

    Parameters:
    user_dict_percentage (dict): A dictionary where keys are interest tags and values are their percentages.

    Returns:
    None. Does nothing when the dictionary is empty.
    """
    if not user_dict_percentage:
        # max() raises ValueError on an empty mapping; with no interests
        # nothing can dominate, so there is nothing to interrupt for.
        return

    # Get the interest with the highest percentage
    interest_tag, interest_percentage = max(user_dict_percentage.items(), key=lambda item: item[1])

    if interest_percentage > 75:
        print(f"Have you been enjoying {interest_tag} so far?")
        response = input()  # Get user's response (blocks until the user answers)
        # Tolerate stray whitespace and any letter case in the answer
        if response.strip().lower() == "yes":
            # A user who is enjoying the interest gets more of it, as the
            # docstring and the planned checkbox describe.
            print("Recommend more of the same interest.")
            # Add checkbox code to recommend more videos of the same interest

    elif interest_percentage > 50:
        print("Suggesting relevant interests...")
        # Add slider code

    DebugPrint("The highest tag is {}. It dominates the interest weights by {}%.".format(interest_tag, interest_percentage), "handle_interruptions")


In [70]:
# handle_interruptions(recalculate_percentages(create_user_interest_dict(user1_interests)))

In [71]:
def note_actions_and_update_weights(
    user_dict_interest,
    interest_tag,
    liked=False,
    shared=False,
    watched=False,
    loop_count=0,
    max_interests=10,
):
    """
    Record a user's actions on a video and update the interest weights.

    An existing interest gains the points computed by observe_action_taken();
    an interest seen for the first time is added at STARTING_WEIGHT (its
    action points are not applied). Percentages are then recalculated and
    passed to handle_interruptions(), which may prompt the user.

    Parameters:
    user_dict_interest (dict): Interest tag -> weight. Updated in place.
    interest_tag (str): Tag of the video that was acted on.
    liked (bool): Whether the user liked the video.
    shared (bool): Whether the user shared the video.
    watched (bool): Whether the user watched the video.
    loop_count (int): The number of times the user looped the video.
    max_interests (int): How many of the heaviest interests to keep in the
        returned dictionary. Defaults to 10.

    Returns:
    dict: A new dictionary holding the `max_interests` heaviest interests,
        ordered by weight descending. The dictionary passed in is NOT trimmed;
        only this returned copy is.
    """
    TOTAL_ACTION_POINTS = observe_action_taken(
        interest_tag, liked, shared, watched, loop_count
    )

    # Update raw weights
    if interest_tag in user_dict_interest:
        print("Updating weights for interest: {}. Total points to add: {}.".format(interest_tag, TOTAL_ACTION_POINTS))
        user_dict_interest[interest_tag] += TOTAL_ACTION_POINTS
        print("Updated weight for interest: {}. New weight: {}.".format(interest_tag, user_dict_interest[interest_tag]))
    else:
        # Report the configured starting weight rather than a hard-coded 10,
        # so the message stays correct if STARTING_WEIGHT changes.
        print("Adding new interest: {}. Total points to add: {} (default).".format(interest_tag, STARTING_WEIGHT))
        user_dict_interest[interest_tag] = STARTING_WEIGHT
        print("Added new interest: {}. Weight: {}.".format(interest_tag, user_dict_interest[interest_tag]))

    # Recalculate percentages
    user_dict_percentage = recalculate_percentages(user_dict_interest)

    # Handle interruptions
    handle_interruptions(user_dict_percentage)

    # Sort the interests by weight in descending order and keep only the top
    # `max_interests`. sorted() is stable, so tied interests keep their
    # insertion order.
    ranked_interests = sorted(
        user_dict_interest.items(), key=lambda item: item[1], reverse=True
    )
    return dict(ranked_interests[:max_interests])

In [72]:
# updated_user1_dict = note_actions_and_update_weights(create_user_interest_dict(user1_interests), "Dogs", liked=True, shared=True, watched=True, loop_count=1)

In [73]:
def get_interest_tag_for_recommendation(user_dict_interest):
    """
    Pick the interest tag to recommend next, weighted by interest percentage.

    A random point is drawn on the cumulative percentage line and the tag
    whose range contains it is selected. If the reserved "Random" tag is
    selected, another tag is chosen uniformly from the user's other interests.

    Parameters:
    user_dict_interest (dict): Interest tag -> weight.

    Returns:
    str: The chosen interest tag.

    Raises:
    ValueError: If user_dict_interest is empty.
    """
    if not user_dict_interest:
        raise ValueError("Cannot pick an interest tag from an empty interest dictionary")

    # Recalculate percentages
    user_dict_percentage = recalculate_percentages(user_dict_interest)
    tags = list(user_dict_percentage)

    # Draw within the actual total instead of a fixed 0-100 integer range:
    # rounded percentages may sum to slightly less than 100, and an inclusive
    # integer draw would give the first tag one extra outcome.
    total_percentage = sum(user_dict_percentage.values())
    random_value = random.uniform(0, total_percentage)

    print("_" * 50)
    print(f"\nRandom value: {random_value:.2f}\n")

    print("Based on User_Dictionary_Percentage (%)\n")

    if total_percentage <= 0:
        # No tag carries any weight, so every tag is equally likely
        interest_tag = random.choice(tags)
    else:
        # Fallback for the floating-point edge where the draw equals the total
        interest_tag = tags[-1]
        cumulative_percentage = 0

        for candidate_tag, percentage in user_dict_percentage.items():
            # Save the starting point of the current tag's range (in percentage)
            previous_cumulative_percentage = cumulative_percentage

            # Add the tag's percentage to the cumulative total to get the end point of the tag's range
            cumulative_percentage += percentage

            print(f"Checking interest: {candidate_tag}, cumulative range: {int(previous_cumulative_percentage)} - {int(cumulative_percentage)}")

            # Strict comparison: a tag with a 0% share has an empty range and
            # can never be selected.
            if random_value < cumulative_percentage:
                interest_tag = candidate_tag
                break

    print(f"\nSelected interest: {interest_tag}\n")

    if interest_tag == "Random":
        # Choose among the real interests only; re-picking "Random" itself
        # would return the placeholder instead of an actual tag.
        other_tags = [tag for tag in tags if tag != "Random"]
        if other_tags:
            interest_tag = random.choice(other_tags)
        print(f"!!Random!! tag selected: {interest_tag}")
        print("MODIFY THIS LATER TO POINT TO CSV COLUMN")
        return interest_tag

    DebugPrint('Chosen interest tag: "{}"'.format(interest_tag), "get_interest_tag_for_recommendation()")
    return interest_tag

___
# Ranking Algorithm


In [74]:
# Function for adding new rows to the collection

# Convert the 'tags' column to a list of strings (grouped according to its respective row)
def add_or_update_ChromaDB_rows():
    """
    Upsert one document per row of the module-level `df` into `collection`.

    Each document is the row's comma-separated "tags" value re-joined with
    ", ". Each document's ID is the row's "video_id" as a string, so the IDs
    returned by a query can be matched back to `df["video_id"]` (which is how
    add_to_queue looks results up).

    NOTE(review): `df` is not defined in any visible cell; it is assumed to be
    a DataFrame with "video_id" and "tags" columns and unique video IDs —
    confirm where it is loaded.
    """
    # Strip each tag so input that is already ", "-separated does not end up
    # with doubled spaces in the stored document.
    documents_str = [
        ", ".join(tag.strip() for tag in tags.split(","))
        for tags in df["tags"]
    ]

    # Key the documents by video ID rather than by row position, which only
    # matched when video IDs happened to be 1..N in row order.
    ids = df["video_id"].astype(str).tolist()

    # Add new rows to the collection (existing IDs are updated)
    collection.upsert(documents=documents_str, ids=ids)

    DebugPrint("Added new rows to the collection.", "add_new_rows()")

In [75]:
# add_or_update_ChromaDB_rows()

In [76]:
# Functions handling the watched videos dictionary

def mark_as_watched(video_id, title, watched_dict):
    """
    Record a video in the watched dictionary.

    Parameters:
    video_id: Key under which the video is stored.
    title (str): Title stored as the value.
    watched_dict (dict): Video ID -> title. Updated in place.

    Returns:
    dict: The same watched_dict object, now containing the video.
    """
    watched_dict[video_id] = title

    notice = '"{}" with ID {} has been added to watched videos.'.format(title, video_id)
    DebugPrint(notice, "mark_as_watched")

    return watched_dict

def show_video_again(video_id, title, watched_dict):
    """
    Remove a video from the watched dictionary so it can be queued again.

    Parameters:
    video_id: Key of the video to remove.
    title (str): Title, used only in the debug message.
    watched_dict (dict): Video ID -> title. Updated in place.

    Returns:
    dict: The same watched_dict object, without the video.

    Raises:
    KeyError: If video_id is not in watched_dict.
    """
    del watched_dict[video_id]

    notice = '"{}" with ID {} has been removed from watched videos. It will be played again on algorithm match.'.format(title, video_id)
    DebugPrint(notice, "show_video_again")

    return watched_dict

In [77]:
# mark_as_watched("1", "Dictionaries", watched)
# show_video_again("1", "Dictionaries", watched)

In [78]:
# Function to add videos to the queue
def add_to_queue(interest_tag, n_results=5):
    """
    Query the collection for videos matching an interest and queue the unwatched ones.

    If the query returns fewer than n_results IDs, the function switches to
    "Random" mode: it clears the module-level `watched` dictionary and
    re-queries with the text "Random".

    Parameters:
    interest_tag (str): Text used as the semantic query.
    n_results (int): Maximum number of results to request. Defaults to 5.

    Returns:
    dict: The module-level `queue` (video ID -> first tag), after adding every
        returned video that is not in `watched`. It may be unchanged when all
        results were already watched.

    Relies on the module-level names `collection`, `df`, `watched` and `queue`.
    """
    # Use ChromaDB to query the top n_results videos closest to the given interest
    results = collection.query(query_texts=[interest_tag], n_results=n_results)
    DebugPrint('Queried top {} videos for tag "{}"'.format(n_results, interest_tag), "add_to_queue")

    # Check if there are not enough results for the given interest tag
    if len(results["ids"][0]) < n_results:
        print(f"Not enough results for interest tag '{interest_tag}'. Needed {n_results}, but got {len(results['ids'][0])}.")
        print("Algorithm set to Random mode.")
        interest_tag = "Random"
        watched.clear()
        results = collection.query(query_texts=[interest_tag], n_results=n_results)

    # A query is capped at n_results, so there is no "more than requested"
    # case to handle here.

    # Iterate over the IDs that actually came back: the collection may hold
    # fewer than n_results documents even after the Random-mode re-query.
    for raw_id in results["ids"][0]:
        result_id = int(raw_id)

        matching_rows = df.loc[df["video_id"] == result_id]
        if matching_rows.empty:
            # The vector store and the table are out of sync for this ID
            print(f"Skipping ID {result_id}: no matching row in the video table.")
            continue

        # Check if the video has already been watched by user
        if result_id in watched:
            continue

        title = matching_rows["video_title"].values[0]
        tag = matching_rows["tags"].values[0].split(",")[0]  # get the first tag

        queue[result_id] = tag
        print(
            f'Added "{title}" with ID {result_id} and tag "{tag}" to the queue'
        )  # Debugging line

    return queue

In [79]:
def view_video(queue, user_dict_interest, watched_dict):
    """
    Simulate the user viewing the next queued video.

    The first video in the queue is removed, marked as watched, and scored as
    liked and watched (fixed for the simulation). The queue is topped up
    before viewing when it is empty, and after viewing when 2 or fewer videos
    remain.

    Parameters:
    queue (dict): Video ID -> tag, in viewing order. Updated in place.
    user_dict_interest (dict): Interest tag -> weight. Updated in place.
    watched_dict (dict): Video ID -> title. Updated in place.

    Returns:
    dict: The queue. If it is still empty after a refill attempt, it is
        returned unchanged and no video is viewed.

    Note: add_to_queue() fills the module-level `queue`, so a refill is only
    visible here when the caller passed that same object.
    """
    if not queue:
        print("Queue is empty! Adding more videos...")
        new_interest_tag = get_interest_tag_for_recommendation(user_dict_interest)
        add_to_queue(new_interest_tag, COUNT_QUEUED_VIDEOS)

    if not queue:
        # The refill can add nothing (for example when every result was
        # already watched). Report it instead of failing with StopIteration.
        print("No videos available to view; the queue is still empty.")
        return queue

    # Take the oldest entry: dicts preserve insertion order
    video_id, interest_tag = next(iter(queue.items()))
    del queue[video_id]
    title = df.loc[df["video_id"] == video_id, "video_title"].values[0]
    mark_as_watched(video_id, title, watched_dict)
    note_actions_and_update_weights(user_dict_interest, interest_tag, liked=True, watched=True)

    if len(queue) <= 2:
        # Report the real count; this branch also runs with 0 or 1 left
        print(f"\n{len(queue)} cached videos left. \nAdding more videos to the queue...\n")
        new_interest_tag = get_interest_tag_for_recommendation(user_dict_interest)
        add_to_queue(new_interest_tag, COUNT_QUEUED_VIDEOS)

    DebugPrint("Viewed video: {}. Queue: {}. Watched: {}".format(title, queue, watched_dict), "view_video")
    return queue

In [80]:
# Simulation Only (replace user input)
# Simulation only: hard-coded stand-ins for real user input.
# chosen_interest_tag is referenced only by a commented-out call in an earlier cell.
chosen_interest_tag = "Dictionaries"
user1_interests = ["DevOps", "Tests (SAT)", "Machine Learning", "Dogs"] # count: 4 (equals USER_MIN_INTEREST)

# Module-level state: add_to_queue() reads and writes these two names directly.
queue = {}  # video ID -> first tag; should be in class
watched = {}  # video ID -> title; should be in a class

# NOTE(review): the functions called below read a DataFrame named `df` that is not
# defined in any visible cell — confirm it is loaded before this cell runs.
add_or_update_ChromaDB_rows()
new_user_dict = create_user_interest_dict(user1_interests)

# Each iteration views one video; view_video() refills the queue as needed.
n = 6  # Replace with the number of times you want to run the simulation
for _ in range(n):
    view_video(queue, new_user_dict, watched)

START ------- DEBUG (INFUNC): add_new_rows() 

'Added new rows to the collection.'

END ------- DEBUG (INFUNC): add_new_rows() 



START ------- DEBUG (INFUNC): create_user_interest_dict 

{'DevOps': 10,
 'Dogs': 10,
 'Machine Learning': 10,
 'Random': 10,
 'Tests (SAT)': 10}

END ------- DEBUG (INFUNC): create_user_interest_dict 



Queue is empty! Adding more videos...
START ------- DEBUG (INFUNC): recalculate_percentages (%) 

{'DevOps': 20.0,
 'Dogs': 20.0,
 'Machine Learning': 20.0,
 'Random': 20.0,
 'Tests (SAT)': 20.0}

END ------- DEBUG (INFUNC): recalculate_percentages (%) 



__________________________________________________

Random value: 71

Based on User_Dictionary_Percentage (%)

Checking interest: DevOps, cumulative range: 0 - 20
Checking interest: Tests (SAT), cumulative range: 20 - 40
Checking interest: Machine Learning, cumulative range: 40 - 60
Checking interest: Dogs, cumulative range: 60 - 80

Selected interest: Dogs

START ------- DEBUG (INFUNC): get_interest_tag

In [81]:
# Frontend implementation scope
    # ACTIONS: Watched and Likes only
        # Watched when 70%

# Backend robustness (view_video())
    # If the queue is still empty after a refill, cache 5 random videos (even ones already marked as watched)

In [82]:
# Update relative calculation (100)

# User schema
    # Dict of Interests
    # Dict `watched`
    # Dict `queue`

# Research relational databases
    # How do we store these structures as tables?