_____
# MAIN SIM

In [25]:
from pprint import pprint
import pandas as pd
import random
import chromadb
from chromadb.utils import embedding_functions

# Constants
NEW_VIDEO_WEIGHT = 10
STARTING_WEIGHT = 10

LIKE = 1
SHARE = 2
WATCH = 1  # more than 50% of total duration
LOOP = 1

USER_MIN_INTEREST = 4
USER_MAX_INTEREST = 8
COUNT_QUEUED_VIDEOS = 5

# AI-Related Constants
OPENAI_API_KEY = "sk-"
OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"

In [26]:
# Initialize ChromaDB client and collection
chroma_client = chromadb.PersistentClient(path="db")
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name=OPENAI_EMBEDDING_MODEL
)

collection = chroma_client.get_or_create_collection(
    name="Edgur_Video_DB_Vectorstore",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"},
)

# Load data from Google Sheets
DB_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSAE2tBAnAdXsxk9a9YClFN7MSEVhzEmJD01ewwtooMLxL-Ilod26EbdD8sZeZk0ybiqD-jqT-9RZbn/pub?gid=497214901&single=true&output=csv"
df = pd.read_csv(DB_URL)

In [30]:
# Functions

def create_user_interest_dict(user_interest_list):
    if len(user_interest_list) < USER_MIN_INTEREST:
        raise ValueError("You must have at least 4 interests")
    elif len(user_interest_list) > USER_MAX_INTEREST:
        raise ValueError("You may have 8 interests at most")

    user_dict_interest = {interest: STARTING_WEIGHT for interest in user_interest_list}
    user_dict_interest["Random"] = STARTING_WEIGHT
    return user_dict_interest

def observe_action_taken(interest_tag, liked=False, shared=False, watched=False, loop_count=0):
    Actions = {"Like": LIKE, "Share": SHARE, "Watch": WATCH, "Loop": LOOP}
    total_action_points = 0
    if liked:
        total_action_points += Actions["Like"]
    if shared:
        total_action_points += Actions["Share"]
    if watched:
        total_action_points += Actions["Watch"]
    total_action_points += loop_count * Actions["Loop"]
    return total_action_points

def recalculate_percentages(user_dict_interest):
    total_weight = sum(user_dict_interest.values())
    user_dict_percentage = {interest_tag: (weight / total_weight) * 100 for interest_tag, weight in user_dict_interest.items()}
    return user_dict_percentage

def handle_interruptions(user_dict_percentage):
    interest_tag, interest_percentage = max(user_dict_percentage.items(), key=lambda item: item[1])
    if interest_percentage > 75:
        print(f"Have you been enjoying {interest_tag} so far?")
        response = input()
        if response.lower() == "yes":
            print("Recommend less of the same interest.")
    elif interest_percentage > 50:
        print("Suggesting relevant interests...")
    print(f"The highest tag is {interest_tag}. It dominates the interest weights by {interest_percentage}%.")

def note_actions_and_update_weights(user_dict_interest, interest_tag, liked=False, shared=False, watched=False, loop_count=0):
    total_action_points = observe_action_taken(interest_tag, liked, shared, watched, loop_count)
    if interest_tag in user_dict_interest:
        user_dict_interest[interest_tag] += total_action_points
    else:
        user_dict_interest[interest_tag] = STARTING_WEIGHT
    user_dict_percentage = recalculate_percentages(user_dict_interest)
    handle_interruptions(user_dict_percentage)
    user_dict_interest = dict(sorted(user_dict_interest.items(), key=lambda item: item[1], reverse=True)[:10])
    return user_dict_interest

def get_interest_tag_for_recommendation(user_dict_interest):
    user_dict_percentage = recalculate_percentages(user_dict_interest)
    random_value = random.randint(0, 100)
    cumulative_percentage = 0
    for interest_tag, percentage in user_dict_percentage.items():
        previous_cumulative_percentage = cumulative_percentage
        cumulative_percentage += percentage
        if random_value <= cumulative_percentage:
            break
    if interest_tag == "Random":
        interest_tag = random.choice(list(user_dict_percentage.keys()))
        return interest_tag
    return interest_tag

def add_to_watched(video_id, title, watched_dict):
    watched_dict[video_id] = title
    print(f'"{title}" with ID {video_id} has been added to watched videos.')
    return watched_dict

def show_to_user_again(video_id, title, watched_dict):
    watched_dict.pop(video_id)
    print(f'"{title}" with ID {video_id} will be shown to user again in the future.')
    return watched_dict

def add_to_queue(interest_tag, n_results=5):
    results = collection.query(query_texts=[interest_tag], n_results=n_results)
    print(f'Queried top {n_results} videos for tag "{interest_tag}"')
    for i in range(n_results):
        result_id = int(results["ids"][0][i])
        title = df.loc[df["video_id"] == result_id, "video_title"].values[0]
        tag = df.loc[df["video_id"] == result_id, "tags"].values[0].split(',')[0]
        if result_id not in watched:
            queue[result_id] = tag
            print(f'Added "{title}" with ID {result_id} and tag "{tag}" to the queue')
    return queue

def view_video(queue, user_dict_interest, watched_dict):
    if not queue:
        print("Queue is empty! Adding more videos...")
        new_interest_tag = get_interest_tag_for_recommendation(user_dict_interest)
        add_to_queue(new_interest_tag, COUNT_QUEUED_VIDEOS)

    video_id, interest_tag = next(iter(queue.items()))
    del queue[video_id]
    title = df.loc[df["video_id"] == video_id, "video_title"].values[0]
    add_to_watched(video_id, title, watched_dict)
    note_actions_and_update_weights(user_dict_interest, interest_tag)
    if len(queue) <= 2:
        print("\nAdding more videos to the queue...\n")
        new_interest_tag = get_interest_tag_for_recommendation(user_dict_interest)
        add_to_queue(new_interest_tag, COUNT_QUEUED_VIDEOS)
    return queue

In [31]:
# Main Simulation

def main_simulation(user_interests, initial_interest_tag, num_iterations=5):
    user_dict_interest = create_user_interest_dict(user_interests)
    watched = {}
    queue = add_to_queue(initial_interest_tag)

    for _ in range(num_iterations):
        print("\n=== Viewing Next Video ===")
        queue = view_video(queue, user_dict_interest, watched)
        print(f"\nUpdated Queue: {queue}")
        pprint(user_dict_interest)
        print(f"Watched Videos: {watched}")
        print("\n=========================\n")

In [33]:
# Example user interests and initial setup
user_interests = ["DevOps", "Tests (SAT)", "Machine Learning", "Cats", "Cooking"]
initial_interest_tag = "Shiba"

# Run the main simulation
main_simulation(user_interests, initial_interest_tag)

Queried top 5 videos for tag "Shiba"
Added "Cryptocurrencies: How Bitcoin Works" with ID 628 and tag "Cryptocurrencies" to the queue
Added "Manipulating Data Frames with Pandas in Ruby" with ID 526 and tag "Pandas" to the queue
Added "Understanding Stock Markets: Basics of Buying and Selling" with ID 625 and tag "Stock markets" to the queue
Added "Basic Exploratory Data Analysis with Pandas" with ID 372 and tag "Pandas" to the queue
Added "Basic Exploratory Data Analysis with Pandas" with ID 70 and tag "Pandas" to the queue

=== Viewing Next Video ===
"Understanding T-Tests in Statistics" with ID 61 has been added to watched videos.
The highest tag is DevOps. It dominates the interest weights by 14.285714285714285%.

Updated Queue: {366: 'T-Tests', 577: 'Ruby', 259: 'Confidence Intervals', 411: 'Confidence Intervals', 628: 'Cryptocurrencies', 526: 'Pandas', 625: 'Stock markets', 372: 'Pandas', 70: 'Pandas'}
{'Cats': 10,
 'Cooking': 10,
 'DevOps': 10,
 'Machine Learning': 10,
 'Random':