___
# User Interest Algorithm


In [74]:
import random
import pandas as pd
import chromadb

from pprint import pprint
from chromadb.utils import embedding_functions


# Constants

new_video_weight = 10
starting_weight = 10

user_min_interest = 4
user_max_interest = 8

count_queued_videos = 5

# Number of vids to queue before running query again (to save costs)
# AKA n_results = 3.

# User Actions
Like = 1
Share = 2
Watch = 1  # more than 50% of total duration
Loop = 1

# Dictionaries - remove
watched_videos = {}

# AI-Related Constants
OPENAI_API_KEY = "sk-"
OPENAI_EmbeddingModel = "text-embedding-3-small"
chroma_client = chromadb.PersistentClient(path="db")

# Use OpenAI as the embedding model (word to vector)
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=OPENAI_API_KEY,
                model_name=OPENAI_EmbeddingModel
            )

# Create a vectorstore database and use Cosine Similarity for semantic search within the DB
collection = chroma_client.get_or_create_collection(
    name="Edgur_Video_DB_Vectorstore",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"},
)

# DB
db_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSAE2tBAnAdXsxk9a9YClFN7MSEVhzEmJD01ewwtooMLxL-Ilod26EbdD8sZeZk0ybiqD-jqT-9RZbn/pub?gid=497214901&single=true&output=csv" # test spreadsheet
df = pd.read_csv(db_url)

# Simulation Only (replace user input)
chosen_interest_tag = "Dictionaries"
user1_interests = ["DevOps", "Tests (SAT)", "Machine Learning", "Cats", "Cooking"] # count: 5


In [75]:
def create_user_interest_dict(user_interest_list):
    if len(user_interest_list) < user_min_interest:
        raise ValueError("You must have at least 4 interests")
    elif len(user_interest_list) > user_max_interest:
        raise ValueError("You may have 8 interests at most")

    user_dict_interest = {interest: starting_weight for interest in user_interest_list}
    user_dict_interest["Random"] = starting_weight
    return user_dict_interest

In [76]:
def observe_action_taken(
    interest_tag, liked=False, shared=False, watched=False, loop_count=0
):
    """
    This function calculates the total points based on the user's actions.

    Parameters:
    interest_tag (str): The tag of the video.
    liked (bool): Whether the user liked the video.
    shared (bool): Whether the user shared the video.
    watched (bool): Whether the user watched the video.
    loop_count (int): The number of times the user looped the video.

    Returns:
    float: The total points calculated based on the user's actions.
    """
    # Actions and their corresponding points to add
    Actions = {
        "Like": Like,
        "Share": Share,
        "Watch": Watch,  # more than 50% of total duration
        "Loop": Loop,
    }

    total_action_points = 0

    if liked:
        total_action_points += Actions["Like"]
    if shared:
        total_action_points += Actions["Share"]
    if watched:
        total_action_points += Actions["Watch"]
    total_action_points += loop_count * Actions["Loop"]

    print(f"\nUpdating weights for interest tag: {interest_tag}")
    print(f"Total points to add: {total_action_points}")

    return total_action_points

In [77]:
def recalculate_percentages(user_dict_interest):
    """
    Recalculate the percentage of each interest relative to the total weight.
    """
    total_weight = sum(user_dict_interest.values())

    user_dict_percentage = {}

    for interest_tag, weight in user_dict_interest.items():
        user_dict_percentage[interest_tag] = (weight / total_weight) * 100

    return user_dict_percentage

In [78]:
def recalculate_percentages(user_dict_interest):
    """
    Recalculate the percentage of each interest relative to the total weight.
    """
    total_weight = sum(user_dict_interest.values())

    user_dict_percentage = {}

    for interest_tag, weight in user_dict_interest.items():
        user_dict_percentage[interest_tag] = (weight / total_weight) * 100

    print(f"Percentage of each interest relative to the total weight: {user_dict_percentage}. Total percentage: {sum(user_dict_percentage.values())}")
    return user_dict_percentage

In [79]:
def handle_interruptions(user_dict_percentage):
    """
    Handle intentional interruptions and suggest relevant interests (50% dominance) or recommend more of the same interest.

    Parameters:
    user_dict_percentage (dict): A dictionary where keys are interest tags and values are their percentages.
    """
    # Get the interest with the highest percentage
    interest_tag, interest_percentage = max(user_dict_percentage.items(), key=lambda item: item[1])

    if interest_percentage > 75:
        print(f"Have you been enjoying {interest_tag} so far?")
        response = input()  # Get user's response
        if response.lower() == "yes":
            print("Recommend less of the same interest.")
            # Add checkbox code to recommend more videos of the same interest

    elif interest_percentage > 50:
        print("Suggesting relevant interests...")
        # Add slider code

    print(f"The highest tag is {interest_tag}. It dominates the interest weights by {interest_percentage}%.")

In [80]:
def note_actions_and_update_weights(
    user_dict_interest,
    interest_tag,
    liked=False,
    shared=False,
    watched=False,
    loop_count=0,
):
    total_action_points = observe_action_taken(
        interest_tag, liked, shared, watched, loop_count
    )

    # Update raw weights
    if interest_tag in user_dict_interest:
        user_dict_interest[interest_tag] += total_action_points
    else:
        user_dict_interest[interest_tag] = starting_weight

    # Recalculate percentages
    user_dict_percentage = recalculate_percentages(user_dict_interest)

    # Handle interruptions
    handle_interruptions(user_dict_percentage, interest_tag)

    # Sort the interests ("key=item[1]") by weight in descending (high to low) order and keep only the top 10
    user_dict_interest = dict(
        sorted(user_dict_interest.items(), key=lambda item: item[1], reverse=True)[:10]
    )

    return user_dict_interest

In [81]:
def note_actions_and_update_weights(
    user_dict_interest,
    interest_tag,
    liked=False,
    shared=False,
    watched=False,
    loop_count=0,
):
    total_action_points = observe_action_taken(
        interest_tag, liked, shared, watched, loop_count
    )

    # Update raw weights
    if interest_tag in user_dict_interest:
        user_dict_interest[interest_tag] += total_action_points
    else:
        user_dict_interest[interest_tag] = starting_weight

    # Recalculate percentages
    user_dict_percentage = recalculate_percentages(user_dict_interest)

    # Handle interruptions
    handle_interruptions(user_dict_percentage)

    # Sort the interests ("key=item[1]") by weight in descending order and keep only the top 10
    user_dict_interest = dict(
        sorted(user_dict_interest.items(), key=lambda item: item[1], reverse=True)[:10]
    )

    return user_dict_interest

In [82]:
# Example usage
user1_interests = ["DevOps", "Tests (SAT)", "Machine Learning", "Cats", "Dogs"] # count: 4
user_interests = create_user_interest_dict(user1_interests)
print("Initial user-chosen interests\n")
pprint(f"{user_interests}")

user_interests = note_actions_and_update_weights(user_interests, "Cats", liked=True)
pprint(f"Updated interests: {user_interests}")


Initial user-chosen interests

("{'DevOps': 10, 'Tests (SAT)': 10, 'Machine Learning': 10, 'Cats': 10, "
 "'Dogs': 10, 'Random': 10}")

Updating weights for interest tag: Cats
Total points to add: 1
Percentage of each interest relative to the total weight: {'DevOps': 16.39344262295082, 'Tests (SAT)': 16.39344262295082, 'Machine Learning': 16.39344262295082, 'Cats': 18.0327868852459, 'Dogs': 16.39344262295082, 'Random': 16.39344262295082}. Total percentage: 99.99999999999999
The highest tag is Cats. It dominates the interest weights by 18.0327868852459%.
("Updated interests: {'Cats': 11, 'DevOps': 10, 'Tests (SAT)': 10, 'Machine "
 "Learning': 10, 'Dogs': 10, 'Random': 10}")


In [83]:
def get_interest_tag_for_recommendation(user_dict_interest):
    # Recalculate percentages
    user_dict_percentage = recalculate_percentages(user_dict_interest)

    # Generate a random number between 0 and 100
    random_value = random.randint(0, 100)

    print("_" * 50)
    print(f"\nRandom value: {random_value}\n")

    cumulative_percentage = 0

    print("Based on User_Dictionary_Percentage (%)\n")

    for interest_tag, percentage in user_dict_percentage.items():
        # Save the starting point of the current tag's range (in percentage)
        previous_cumulative_percentage = cumulative_percentage

        # Add the tag's percentage to the cumulative total to get the end point of the tag's range
        cumulative_percentage += percentage

        print(f"Checking interest: {interest_tag}, cumulative range: {int(previous_cumulative_percentage)} - {int(cumulative_percentage)}")

        # If the random value is less than or equal to the cumulative percentage up to the current interest,
        # select the current interest and stop looking at the rest of the interests.
        if random_value <= cumulative_percentage:
            print(f"\nSelected interest: {interest_tag}\n")
            break

    if interest_tag == "Random":
        interest_tag = random.choice(list(user_dict_percentage.keys()))
        print(f"!!Random!! tag selected: {interest_tag}")
        print("MODIFY THIS LATER TO POINT TO CSV COLUMN")
        return interest_tag

    return interest_tag

In [84]:
random_tag = get_interest_tag_for_recommendation(user_interests)
print(f"Randomly selected tag for recommendation: {random_tag}")

Percentage of each interest relative to the total weight: {'Cats': 18.0327868852459, 'DevOps': 16.39344262295082, 'Tests (SAT)': 16.39344262295082, 'Machine Learning': 16.39344262295082, 'Dogs': 16.39344262295082, 'Random': 16.39344262295082}. Total percentage: 99.99999999999999
__________________________________________________

Random value: 91

Based on User_Dictionary_Percentage (%)

Checking interest: Cats, cumulative range: 0 - 18
Checking interest: DevOps, cumulative range: 18 - 34
Checking interest: Tests (SAT), cumulative range: 34 - 50
Checking interest: Machine Learning, cumulative range: 50 - 67
Checking interest: Dogs, cumulative range: 67 - 83
Checking interest: Random, cumulative range: 83 - 99

Selected interest: Random

!!Random!! tag selected: Cats
MODIFY THIS LATER TO POINT TO CSV COLUMN
Randomly selected tag for recommendation: Cats


In [85]:
print(f"User Dict Interest: {user_interests}\n\nPercentages: {recalculate_percentages(user_interests)}")

Percentage of each interest relative to the total weight: {'Cats': 18.0327868852459, 'DevOps': 16.39344262295082, 'Tests (SAT)': 16.39344262295082, 'Machine Learning': 16.39344262295082, 'Dogs': 16.39344262295082, 'Random': 16.39344262295082}. Total percentage: 99.99999999999999
User Dict Interest: {'Cats': 11, 'DevOps': 10, 'Tests (SAT)': 10, 'Machine Learning': 10, 'Dogs': 10, 'Random': 10}

Percentages: {'Cats': 18.0327868852459, 'DevOps': 16.39344262295082, 'Tests (SAT)': 16.39344262295082, 'Machine Learning': 16.39344262295082, 'Dogs': 16.39344262295082, 'Random': 16.39344262295082}


___
# Ranking Algorithm


In [None]:
# Function for adding new rows to the collection

# Convert the 'tags' column to a list of strings (grouped according to its respective row)
documents = df["tags"].apply(lambda x: x.split(",")).tolist()
documents_str = [", ".join(doc) for doc in documents]

# Add the documents to the collection
# todo: handle UUIDs
ids = [str(i + 1) for i in range(len(documents_str))]

# Add new rows to the collection
collection.upsert(documents=documents_str, ids=ids)

In [None]:
# Query the collection for the top 3 most similar videos

results = collection.query(query_texts=["Shiba"], n_results=3)

# Print the results
pprint(results)

# Get the ID of the first result
result_id = int(results["ids"][0][0])

# Get the title corresponding to the ID
title = df.loc[df["video_id"] == result_id, "video_title"].values[0]

print(f"\nTitle: {title}")

{'data': None,
 'distances': [[0.7705810511945834, 0.7818170785903931, 0.7819685339927673]],
 'documents': [['Cryptocurrencies,  Bitcoin,  How it works,  Blockchain '
                'technology,  Digital currency',
                'Python,  shelve library,  data storage,  key-value store,  '
                'file persistence',
                'Python,  shelve library,  data storage,  key-value store,  '
                'file persistence']],
 'embeddings': None,
 'ids': [['628', '270', '132']],
 'metadatas': [[None, None, None]],
 'uris': None}

Title: Cryptocurrencies: How Bitcoin Works


In [None]:
# Usage
watched = {} # Should be in a class

# Functions handling the watched videos dictionary

def add_to_watched(video_id, title, watched_dict):
    watched_dict[video_id] = title
    print(f'"{title}" with ID {video_id} has been added to watched videos.')
    return watched_dict

def show_to_user_again(video_id, title, watched_dict):
    watched_dict.pop(video_id)
    print(f'"{title}" with ID {video_id} will be shown to user again in the future.')
    return watched_dict

add_to_watched(result_id, title, watched)
print(watched)
show_to_user_again(result_id, title, watched)
print(watched)

"Cryptocurrencies: How Bitcoin Works" with ID 628 has been added to watched videos.
{628: 'Cryptocurrencies: How Bitcoin Works'}
"Cryptocurrencies: How Bitcoin Works" with ID 628 will be shown to user again in the future.
{}


In [None]:
# Initialize the queue
queue = {} # should be in class

# Function to add videos to the queue
def add_to_queue(interest_tag, n_results=5):
    # Use ChromaDB to query top 5 videos closest to given interest
    results = collection.query(query_texts=[interest_tag], n_results=n_results)
    print(f'Queried top {n_results} videos for tag "{interest_tag}"')  # Debugging line

    # Add the videos to the queue
    for i in range(n_results):
        result_id = int(results["ids"][0][i])
        title = df.loc[df["video_id"] == result_id, "video_title"].values[0]
        tag = df.loc[df["video_id"] == result_id, "tags"].values[0].split(',')[0]  # get the first tag

        # Check if the video has already been watched by user
        if result_id not in watched:
            queue[result_id] = tag
            print(f'Added video with ID {result_id} and tag "{tag}" to the queue')  # Debugging line


    return queue

In [None]:
queue = {} # should be in class

In [None]:
add_to_queue("Date")

Queried top 5 videos for tag "Date"
Added video with ID 606 and tag "javascript" to the queue
Added video with ID 591 and tag "ruby" to the queue
Added video with ID 562 and tag "Python" to the queue
Added video with ID 525 and tag "SQL" to the queue
Added video with ID 570 and tag "Ruby" to the queue


{606: 'javascript', 591: 'ruby', 562: 'Python', 525: 'SQL', 570: 'Ruby'}

In [88]:
# Recommend and Queue videos function (MAIN)

# Function to view a video
def view_video(queue, user_dict_interest, watched_dict):

    # Pop the first video from the queue
    video_id, interest_tag = next(iter(queue.items()))
    del queue[video_id]

    # Add the video to the watched videos
    add_to_watched(video_id, title, watched_dict)

    # Update the user's interest weights based on the watched video
    note_actions_and_update_weights(user_dict_interest, interest_tag) # SIMULATION - User took no action

    # If the queue length drops to 2, get random tag and add more videos to the queue
    if len(queue) <= 2:
        print("\nAdding more videos to the queue...\n")
        new_interest_tag = get_interest_tag_for_recommendation(user_dict_interest)
        add_to_queue(new_interest_tag, count_queued_videos)

    return queue

# Add the first 5 videos to the queue
view_video(queue, user_interests, watched)

print()
pprint(user_interests)

"Cryptocurrencies: How Bitcoin Works" with ID 591 has been added to watched videos.

Updating weights for interest tag: ruby
Total points to add: 0
Percentage of each interest relative to the total weight: {'Cats': 15.492957746478872, 'DevOps': 14.084507042253522, 'Tests (SAT)': 14.084507042253522, 'Machine Learning': 14.084507042253522, 'Dogs': 14.084507042253522, 'Random': 14.084507042253522, 'ruby': 14.084507042253522}. Total percentage: 100.0
The highest tag is Cats. It dominates the interest weights by 15.492957746478872%.

{'Cats': 11,
 'DevOps': 10,
 'Dogs': 10,
 'Machine Learning': 10,
 'Random': 10,
 'Tests (SAT)': 10,
 'ruby': 10}


# #BreakItDown - Tasks

### 1. Mark video as watched ✅
- If ID exist
- query result skip over id and check again if ID exists
- repeat until not 

### 2. Revised percentage calculation (relative instead of 100%) ✅
Will cause a tag to be dominant tho (weight 1000 while others still 10). Mitigated with intentional interruptions (#4). 

### 3. User class schema ✅
- self
- interest_weights (dict)
- watched_videos (dict)

### 4. Intentional Interruptions ✅
Handle dominations
- 50% -> Suggest relevant interests 
- 75% -> "Have you been enjoying {tag name} so far? "yes". "recommend more". 

### 5. Modify whisper prompt to return most relevant tag first then extract that for on_user_swipe()
in [github](https://github.com/edgurinc/edgur/blob/main/backend/accounts/videoprocessors.py)
- To help us later in handling which tag to add weights to given videos have multiple tags. 

### 6. Decay other interests whenever watching a video ✅
except current video tag 

### 7. Handle irrelevant tags
Acknowledge that it will be filled soon, such as "food" category

### 8. "Cache" next 5 videos ✅
Run uniformDist+reco only on 3rd video 

### 9. Min selection Max selection for categories ✅
Like tiktok's

### 10. Show this to me again (#SpacedRepetition) ✅
Remove from watched dictionary

In [None]:
# Update relative calculation (100)

# User schema
    # Dict of Interests
    # Dict `watched`
    # Dict `queue`

# Research Relational database
    # Tables how do we store?