In [0]:
!pip install supabase

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# !pip install sentence_transformers # this takes quite a long time, at least 3 min

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from supabase import create_client, Client
from sklearn.feature_extraction.text import TfidfTransformer
from datetime import datetime, timedelta

In [0]:

# get url and key using databricks secrets
supabase_url = dbutils.secrets.get(scope='rippl', key='supabase_url')
supabase_key = dbutils.secrets.get(scope='rippl', key='supabase_api_key')

# make connection with supabase backend
supabase : Client = create_client(supabase_url, supabase_key)

In [0]:
# declaring functions cell block


def pull_full_table(table_name):
    # got tired of writing this out multiple times so even though it's more clunkly in jupyter notebooks I made a function for this

    # error handling
    # can I check if table doesn't exist in supabase? 

    # get total count of rows
    response = supabase.table(table_name).select("*", count="exact").execute()
    total_count = response.count

    # calculate num_batches
    num_batches = (total_count // 1000) + 1 # can only pull 1000 per batch 

    # print outputs to validate
    print(f"Total Rows in Table: {total_count}")
    print(f"Num Batches to get Total Rows {num_batches}")

    # actually do the loop

    for batch in range(1, num_batches + 1): # start at 1 (or start variable equation doesn't work)
        
        # get start and end of 'range' of rows to call in table
        start = (batch - 1) * 1000  # eg. first batch, will be 0
        end = (batch * 1000) - 1 # eg. first batch, will be 999 (unless less than 999)
        if  end > total_count:
            end = total_count
        
        # print outputs to validate
        print(f"For batch: {batch}, start: {start}, end: {end}")

        # get rows in range(start, end)
        temp_response = supabase.table(table_name).select("*").range(start, end).execute() 

        # get data
        temp_data = temp_response.data
        
        # convert to pandas dataframe for easy handling
        temp_df = pd.DataFrame(temp_data)

        # concatenate (or if batch one, copy)
        if batch == 1:
            result_df = temp_df.copy()

        else:
            result_df = pd.concat([result_df, temp_df], axis=0)

    # reset index and remove prior index
    result_df.reset_index(drop=True, inplace=True)

    return result_df

def delete_table(table_name):
    # so apparently whenever supabase runs a delete command it needs a 'where' command
    # but the way around this is just to run the neq command -- which I guess is looking for a case where 
    # user_id is the 'dummy_id' to make sure it doesn't delete that one
    # but because the dummy_id is intentionally something that shouldn't be a uuid
    # it ends up deleting the whole table 
    dummy_id = "00000000-0000-0000-0000-000000000000"
    response = supabase.table(table_name).delete().neq("user_id", dummy_id).execute()
    print("Deletion response:", response)

def get_similarity_df(user_interest_matrix_unprocessed):
    # attempting to use tfidf vectors to do simple content filtering 
    user_interest_matrix_tfidf = user_interest_matrix_unprocessed.copy()
    user_interest_matrix_tfidf.fillna(0, inplace=True)
    user_interest_matrix_tfidf

    tfidf_transformer = TfidfTransformer()
    user_interests_tfidf = tfidf_transformer.fit_transform(user_interest_matrix_tfidf.values).toarray()
    user_interests_tfidf

    user_vectors = user_interest_matrix_tfidf.values
    # seems dimensionality depends on the vocabulary size? Not sure how to make this a fixed number? 
    similarity_matrix = cosine_similarity(user_vectors, user_vectors)
    user_ids = user_interest_matrix_tfidf.index.tolist()
    similarity_df = pd.DataFrame(similarity_matrix, index=user_ids, columns=user_ids)
    return similarity_df


def remove_duplicate_interest_recommendations(new_df_original, old_df_original):
    
    #creating copies so I don't accidentally 
    new_df = new_df_original.copy()
    old_df = old_df_original.copy()

    if new_df.empty:
        print("new df empty, returning old")
        return old_df
    elif old_df.empty:
        print("old df empty, returning new")
        return new_df
    
    # converts all user_ids and recommended_user_ids to strings and strips trailing and leading whitespaces
    for col in ['user_id', 'recommended_interest_id']:
        new_df[col] = new_df[col].astype(str).str.strip()
        old_df[col] = old_df[col].astype(str).str.strip()

    old_pairs = set(zip(old_df['user_id'], old_df['recommended_interest_id']))

    mask = new_df.apply(lambda row: (row['user_id'], row['recommended_interest_id']) not in old_pairs, axis=1)

    if not mask.any():
        print("new df is all duplicates of old df, returning old df")
        return old_df
    
    # applying mask to original new_df
    filtered_new_df = new_df_original[mask].reset_index(drop=True)
    
    return filtered_new_df


def check_oldest_timestamp(old_recs_df, day_threshold=7):
    old_copy = old_recs_df.copy()
    
    old_copy['timestamp'] = pd.to_datetime(old_copy['timestamp'], errors='coerce')

    oldest_ts = old_copy['timestamp'].min()


    day_threshold = datetime.now() - timedelta(days=day_threshold)

    if oldest_ts <= day_threshold:
        print("old_recs_df is older than day threshold set, it should be deleted")
        return True
    else:
        print("old_recs_df is not older than day threshold set, it should be kept")
        return False

def delete_table(table_name):
    # so apparently whenever supabase runs a delete command it needs a 'where' command
    # but the way around this is just to run the neq command -- which I guess is looking for a case where 
    # user_id is the 'dummy_id' to make sure it doesn't delete that one
    # but because the dummy_id is intentionally something that shouldn't be a uuid
    # it ends up deleting the whole table 
    dummy_id = "00000000-0000-0000-0000-000000000000"
    response = supabase.table(table_name).delete().neq("user_id", dummy_id).execute()
    print("Deletion response:", response)

In [0]:

users_df = pull_full_table('users')
users_df

Total Rows in Table: 20
Num Batches to get Total Rows 1
For batch: 1, start: 0, end: 20


Unnamed: 0,id,name,image,description,email,created_at
0,897a4d99-a0d4-474a-b34d-78ab1e062a93,Sierra Trailson,[REDACTED]/stora...,Weekend wanderer and sunrise seeker. I hike to...,abrielle@rippl.world,2025-04-14T02:17:13.164935
1,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,Gregory Mcgregerson,[REDACTED]/stora...,Hi,johnrich398@gmail.com,2025-04-11T04:11:37.742094
2,a46c20e6-1c11-4a59-a70c-1cc22862885d,Savanna Davis,[REDACTED]/stora...,I love video games!,saavannaa.rose@gmail.com,2025-04-09T23:31:03.00942
3,449bc0ab-6942-426e-b573-9cb690ffd1b9,Sydney Blake,[REDACTED]/stora...,I live for fresh powder runs and cozy evenings...,dfalsabrook@gmail.com,2025-04-10T03:35:57.447764
4,3fb510a5-58a0-4994-81ba-29e69fa79db1,David Meddie,[REDACTED]/stora...,I really like cars!,david.meddaugh@atlasschool.com,2025-04-10T20:14:04.190027
5,c2c9fd5e-2040-4ee7-92ec-4bd65a467a27,Cody,[REDACTED]/stora...,,codywalenciak@gmail.com,2025-04-10T20:38:11.824933
6,740f8541-0638-4348-8fd9-72453613be4e,Natalie Baker,[REDACTED]/stora...,Lover of sweet treats and scenic trails. You’l...,abrielleperry22@icloud.com,2025-04-13T03:15:09.953282
7,7ca36877-fc2f-478e-951d-b4af878f25ef,Blake Renshaw,[REDACTED]/stora...,"Grew up in the garage, raised on horsepower an...",juvx6bpas6@knmcadibav.com,2025-04-15T17:12:07.099672
8,ecb95c11-4923-42d0-9a45-01eb27cb832a,Mike Rodgers,[REDACTED]/stora...,Old soul with a love for all things that move ...,david.alasbrook@atlasschool.com,2025-04-10T20:19:26.250657
9,3e6acdeb-aa9a-47f1-879d-d7139eb98e2f,Carson Ridge,[REDACTED]/stora...,I’m a sculptor and ceramic artist who finds pe...,david.a@rippl.world,2025-04-10T04:13:53.414479


In [0]:
user_interest_df = pull_full_table("user_interests")
user_interest_df

Total Rows in Table: 152
Num Batches to get Total Rows 1
For batch: 1, start: 0, end: 152


Unnamed: 0,user_id,interest_id
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,15bb36f9-7466-4e83-a148-5096114cce9e
1,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,62b86242-7014-499f-9cb9-42cfca036de9
2,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6663208a-593c-4c58-8df2-1cf2fda3cce6
3,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,872755d5-698f-4b66-8e80-098c01ca96eb
4,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,96081371-66ce-49ef-adf7-a1f90b47fdb3
...,...,...
147,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,6117a58e-f12f-4af1-b031-da9fa32d0742
148,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c
149,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,baa13863-6091-4872-ba3d-b29a09af0d05
150,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,cbf7c5eb-44e3-4364-b5cd-a0ecd045ee3d


In [0]:
interests_df = pull_full_table("interests")
interests_df

Total Rows in Table: 47
Num Batches to get Total Rows 1
For batch: 1, start: 0, end: 47


Unnamed: 0,id,name,category_id
0,03f7dc13-cd91-4eaf-95c2-59dc783c8e03,Math,324b188f-361a-416d-9f58-83b9a2e8606c
1,06b52f51-32a3-4921-948c-23cb1f474ca3,Pottery,53ea2977-9a06-4445-bc95-b6d8acf72010
2,0f790fe0-7859-4408-9b98-c8cee119d659,Knitting,53ea2977-9a06-4445-bc95-b6d8acf72010
3,15bb36f9-7466-4e83-a148-5096114cce9e,Yoga,3e50575e-4896-4cf1-a98b-35f629474335
4,17a74d46-88b0-40a8-afa8-bbca0d79d285,Rollerskating,384da8b9-33f2-4f22-83c7-fc05d8bf2298
5,1820882b-7561-4f92-9f8d-8c241be21cf6,Skiing,384da8b9-33f2-4f22-83c7-fc05d8bf2298
6,1888b454-b69e-46bf-bc92-24e6a1085518,Playing Instruments,9d5bd5f2-ec1d-46ca-9c75-0965c676d465
7,1f85d55e-0b0c-4dd7-a9b0-82e4cb699145,Cardio,3e50575e-4896-4cf1-a98b-35f629474335
8,31242d1e-66fe-43fb-8d48-06e751c50e32,Studying,324b188f-361a-416d-9f58-83b9a2e8606c
9,40ddb76d-be61-4b45-9ec1-1d917e3a08c0,Sewing,53ea2977-9a06-4445-bc95-b6d8acf72010


In [0]:
n_users = users_df.index.max() # current number of users
print("n_users: ", n_users)
n_interests = interests_df.index.max() # current number of interests
print("n_interests: ", n_interests)

n_users:  19
n_interests:  46


In [0]:


new_interest_id_map = {}
j = 0
for old in interests_df['id']:
  new_interest_id_map[old] = j
  j += 1

new_user_id_map = {}
i = 0
for old in users_df['id']:
    new_user_id_map[old] = i
    i += 1

new_interest_id_map

{'03f7dc13-cd91-4eaf-95c2-59dc783c8e03': 0,
 '06b52f51-32a3-4921-948c-23cb1f474ca3': 1,
 '0f790fe0-7859-4408-9b98-c8cee119d659': 2,
 '15bb36f9-7466-4e83-a148-5096114cce9e': 3,
 '17a74d46-88b0-40a8-afa8-bbca0d79d285': 4,
 '1820882b-7561-4f92-9f8d-8c241be21cf6': 5,
 '1888b454-b69e-46bf-bc92-24e6a1085518': 6,
 '1f85d55e-0b0c-4dd7-a9b0-82e4cb699145': 7,
 '31242d1e-66fe-43fb-8d48-06e751c50e32': 8,
 '40ddb76d-be61-4b45-9ec1-1d917e3a08c0': 9,
 '43888455-55cf-4c09-ad26-df7d4673e19c': 10,
 '44227a36-49a0-4f30-94dd-3dc1c4e0a951': 11,
 '4b57788f-1d57-4286-b760-14e6fbccf2f6': 12,
 '5bf4236f-11ed-4332-8bb5-c0cc009adc99': 13,
 '6117a58e-f12f-4af1-b031-da9fa32d0742': 14,
 '62b86242-7014-499f-9cb9-42cfca036de9': 15,
 '6663208a-593c-4c58-8df2-1cf2fda3cce6': 16,
 '6ad40466-14f4-4bbd-8d9c-a17590ab2f2c': 17,
 '6c22dd13-897c-4b98-95d0-32dda931a2d3': 18,
 '72a885d5-db6e-4148-9d2a-09347d948451': 19,
 '7e21b3d7-9a17-4e2a-aa69-a4feba8e7c84': 20,
 '7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c': 21,
 '86b64472-50ea-4d2e

In [0]:
new_user_id_map

{'897a4d99-a0d4-474a-b34d-78ab1e062a93': 0,
 '02c3c862-b8a5-4fde-ac56-e3b5545f18d6': 1,
 'a46c20e6-1c11-4a59-a70c-1cc22862885d': 2,
 '449bc0ab-6942-426e-b573-9cb690ffd1b9': 3,
 '3fb510a5-58a0-4994-81ba-29e69fa79db1': 4,
 'c2c9fd5e-2040-4ee7-92ec-4bd65a467a27': 5,
 '740f8541-0638-4348-8fd9-72453613be4e': 6,
 '7ca36877-fc2f-478e-951d-b4af878f25ef': 7,
 'ecb95c11-4923-42d0-9a45-01eb27cb832a': 8,
 '3e6acdeb-aa9a-47f1-879d-d7139eb98e2f': 9,
 'e13ae6de-0656-493c-9165-80c1b0cd9bf6': 10,
 'ec03cce8-e743-45af-97e9-71bc05588376': 11,
 'f8f9ec17-fb66-4689-abc2-529d61dfb1f4': 12,
 'eb88b8b4-bdea-4aa0-af25-ec728d4083e5': 13,
 '0b7249ca-cc82-4ff1-9d50-d8d61280b717': 14,
 '436c7907-3ded-4313-832a-831fd3259848': 15,
 '76e52c42-0d73-42b7-95c4-20aadcd886ae': 16,
 '45365d21-9977-4505-a0f0-7f090bc33747': 17,
 'a99e13b8-fc64-48c6-a502-72e91972c107': 18,
 '355e5f7c-87fa-45c4-a218-ff904b7a1128': 19}

In [0]:
user_interest_df['has_interest'] = 1
user_interest_df

Unnamed: 0,user_id,interest_id,has_interest
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,15bb36f9-7466-4e83-a148-5096114cce9e,1
1,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,62b86242-7014-499f-9cb9-42cfca036de9,1
2,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6663208a-593c-4c58-8df2-1cf2fda3cce6,1
3,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,872755d5-698f-4b66-8e80-098c01ca96eb,1
4,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,96081371-66ce-49ef-adf7-a1f90b47fdb3,1
...,...,...,...
147,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,6117a58e-f12f-4af1-b031-da9fa32d0742,1
148,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c,1
149,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,baa13863-6091-4872-ba3d-b29a09af0d05,1
150,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,cbf7c5eb-44e3-4364-b5cd-a0ecd045ee3d,1


In [0]:
user_interest_matrix_unprocessed = user_interest_df.pivot(index='user_id', columns='interest_id', values='has_interest')
user_interest_matrix_unprocessed

interest_id,03f7dc13-cd91-4eaf-95c2-59dc783c8e03,06b52f51-32a3-4921-948c-23cb1f474ca3,0f790fe0-7859-4408-9b98-c8cee119d659,15bb36f9-7466-4e83-a148-5096114cce9e,17a74d46-88b0-40a8-afa8-bbca0d79d285,1820882b-7561-4f92-9f8d-8c241be21cf6,1888b454-b69e-46bf-bc92-24e6a1085518,1f85d55e-0b0c-4dd7-a9b0-82e4cb699145,40ddb76d-be61-4b45-9ec1-1d917e3a08c0,43888455-55cf-4c09-ad26-df7d4673e19c,44227a36-49a0-4f30-94dd-3dc1c4e0a951,4b57788f-1d57-4286-b760-14e6fbccf2f6,5bf4236f-11ed-4332-8bb5-c0cc009adc99,6117a58e-f12f-4af1-b031-da9fa32d0742,62b86242-7014-499f-9cb9-42cfca036de9,6663208a-593c-4c58-8df2-1cf2fda3cce6,6c22dd13-897c-4b98-95d0-32dda931a2d3,72a885d5-db6e-4148-9d2a-09347d948451,7e21b3d7-9a17-4e2a-aa69-a4feba8e7c84,7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c,86b64472-50ea-4d2e-b062-69024291cad4,872755d5-698f-4b66-8e80-098c01ca96eb,882a94f2-33f1-412a-931f-96e9dc1b948b,88d66a6d-8bdb-413e-8a21-a2012c737f16,912f8c61-5ab3-4283-b0ad-015b0038ba23,93c2f1f1-c92f-4a33-af8e-669b40d1b34e,96081371-66ce-49ef-adf7-a1f90b47fdb3,98df11b0-8fe1-481f-b6e3-464a9b8c141a,9b9d4d70-dec0-46a9-b7ef-47a773735861,b0d74040-5f4f-4e22-b874-93a61352346f,b17e02df-e59b-463d-9bb9-17da4029df75,ba47674f-e078-4769-85d8-8e46e20d39c8,ba5bc1b4-501c-4fbd-899c-63375b9b5ca3,baa13863-6091-4872-ba3d-b29a09af0d05,c563e58a-aae7-4ff2-ba87-2fc1035168e2,cbf7c5eb-44e3-4364-b5cd-a0ecd045ee3d,d6d79c59-6338-47b0-b6b4-88bad3f0027e,e558275e-00d8-4958-a134-2ac23927b2da,e725fe78-8e89-45b2-a7a3-e41e7e5f6733,e7a6d63c-b7bf-49ae-8b15-0c9d4eee16f6,e8f7f688-a517-46a5-b708-3413bc78fb57,fea6ed7d-9c6b-44ac-9e8e-198ca9a681a2,ff459b3e-f2aa-4057-b7c2-8a8ddf98bd9d
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
02c3c862-b8a5-4fde-ac56-e3b5545f18d6,,,,1.0,,,,,,,,,,,1.0,1.0,,,,,,1.0,,,,,1.0,,,,,,1.0,,,,,,,,,,
0b7249ca-cc82-4ff1-9d50-d8d61280b717,,,,,1.0,,1.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,1.0,
355e5f7c-87fa-45c4-a218-ff904b7a1128,,,,,,,,,,,,1.0,,,,,1.0,,,,,,,,,1.0,,1.0,,,,,,1.0,,,1.0,,,,,,
3e6acdeb-aa9a-47f1-879d-d7139eb98e2f,,1.0,,1.0,,,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,1.0,,,
3fb510a5-58a0-4994-81ba-29e69fa79db1,,,,,,,,,,,,,,1.0,,,1.0,,,,,1.0,,1.0,,,1.0,1.0,,,,1.0,,,,,1.0,,,,,,
436c7907-3ded-4313-832a-831fd3259848,,1.0,1.0,,,,,,,1.0,,,,,,,,,1.0,,,,1.0,,,,,,,,1.0,,,,1.0,1.0,,1.0,,,,,
449bc0ab-6942-426e-b573-9cb690ffd1b9,,,,,,1.0,,,,1.0,,,1.0,,1.0,,,,,,,,,,,1.0,,1.0,,,,,,,,,,,,,,,
45365d21-9977-4505-a0f0-7f090bc33747,,,,,,,1.0,,1.0,1.0,1.0,,,1.0,,,,,1.0,,1.0,,,,,,,,,,,,,,1.0,,1.0,,,,,,
740f8541-0638-4348-8fd9-72453613be4e,,,,,,,,,,,,,,,,,,1.0,,,1.0,,,,,,,1.0,,,,,,,,,1.0,,,1.0,1.0,,
76e52c42-0d73-42b7-95c4-20aadcd886ae,,1.0,1.0,,,,,,,,,,,,,,,,,,,,1.0,,,1.0,,,,,,,,,,,1.0,1.0,,1.0,,,


In [0]:
similarity_df = get_similarity_df(user_interest_matrix_unprocessed)
similarity_df


Unnamed: 0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,0b7249ca-cc82-4ff1-9d50-d8d61280b717,355e5f7c-87fa-45c4-a218-ff904b7a1128,3e6acdeb-aa9a-47f1-879d-d7139eb98e2f,3fb510a5-58a0-4994-81ba-29e69fa79db1,436c7907-3ded-4313-832a-831fd3259848,449bc0ab-6942-426e-b573-9cb690ffd1b9,45365d21-9977-4505-a0f0-7f090bc33747,740f8541-0638-4348-8fd9-72453613be4e,76e52c42-0d73-42b7-95c4-20aadcd886ae,7ca36877-fc2f-478e-951d-b4af878f25ef,897a4d99-a0d4-474a-b34d-78ab1e062a93,a46c20e6-1c11-4a59-a70c-1cc22862885d,a99e13b8-fc64-48c6-a502-72e91972c107,c2c9fd5e-2040-4ee7-92ec-4bd65a467a27,e13ae6de-0656-493c-9165-80c1b0cd9bf6,eb88b8b4-bdea-4aa0-af25-ec728d4083e5,ec03cce8-e743-45af-97e9-71bc05588376,ecb95c11-4923-42d0-9a45-01eb27cb832a,f8f9ec17-fb66-4689-abc2-529d61dfb1f4
02c3c862-b8a5-4fde-ac56-e3b5545f18d6,1.0,0.0,0.0,0.166667,0.288675,0.0,0.166667,0.0,0.0,0.0,0.166667,0.0,0.166667,0.166667,0.105409,0.166667,0.0,0.0,0.272166,0.0
0b7249ca-cc82-4ff1-9d50-d8d61280b717,0.0,1.0,0.0,0.0,0.0,0.272166,0.166667,0.408248,0.0,0.0,0.166667,0.136083,0.333333,0.333333,0.421637,0.166667,0.166667,0.339683,0.0,0.154303
355e5f7c-87fa-45c4-a218-ff904b7a1128,0.0,0.0,1.0,0.333333,0.433013,0.0,0.333333,0.136083,0.333333,0.308607,0.0,0.272166,0.0,0.0,0.105409,0.333333,0.166667,0.339683,0.272166,0.308607
3e6acdeb-aa9a-47f1-879d-d7139eb98e2f,0.166667,0.0,0.333333,1.0,0.288675,0.136083,0.166667,0.0,0.333333,0.308607,0.0,0.272166,0.166667,0.166667,0.210819,0.0,0.166667,0.226455,0.136083,0.0
3fb510a5-58a0-4994-81ba-29e69fa79db1,0.288675,0.0,0.433013,0.288675,1.0,0.0,0.144338,0.235702,0.288675,0.133631,0.433013,0.117851,0.144338,0.0,0.091287,0.144338,0.144338,0.196116,0.589256,0.267261
436c7907-3ded-4313-832a-831fd3259848,0.0,0.272166,0.0,0.136083,0.0,1.0,0.136083,0.333333,0.0,0.503953,0.0,0.222222,0.408248,0.0,0.344265,0.0,0.544331,0.3698,0.111111,0.125988
449bc0ab-6942-426e-b573-9cb690ffd1b9,0.166667,0.166667,0.333333,0.166667,0.144338,0.136083,1.0,0.136083,0.166667,0.154303,0.0,0.136083,0.0,0.0,0.0,0.0,0.0,0.113228,0.0,0.0
45365d21-9977-4505-a0f0-7f090bc33747,0.0,0.408248,0.136083,0.0,0.235702,0.333333,0.136083,1.0,0.272166,0.125988,0.0,0.333333,0.136083,0.136083,0.344265,0.136083,0.272166,0.3698,0.111111,0.377964
740f8541-0638-4348-8fd9-72453613be4e,0.0,0.0,0.333333,0.333333,0.288675,0.0,0.166667,0.272166,1.0,0.308607,0.0,0.136083,0.0,0.166667,0.210819,0.166667,0.333333,0.339683,0.272166,0.154303
76e52c42-0d73-42b7-95c4-20aadcd886ae,0.0,0.0,0.308607,0.308607,0.133631,0.503953,0.154303,0.125988,0.308607,1.0,0.0,0.377964,0.308607,0.154303,0.29277,0.154303,0.617213,0.314485,0.125988,0.142857


In [0]:
merged_user_interests = pd.merge(users_df, user_interest_df, left_on='id', right_on='user_id')
merged_user_interests = merged_user_interests.drop(['id', "has_interest", "image", "created_at", "email"], axis=1)
merged_user_interests

Unnamed: 0,name,description,user_id,interest_id
0,Sierra Trailson,Weekend wanderer and sunrise seeker. I hike to...,897a4d99-a0d4-474a-b34d-78ab1e062a93,06b52f51-32a3-4921-948c-23cb1f474ca3
1,Sierra Trailson,Weekend wanderer and sunrise seeker. I hike to...,897a4d99-a0d4-474a-b34d-78ab1e062a93,0f790fe0-7859-4408-9b98-c8cee119d659
2,Sierra Trailson,Weekend wanderer and sunrise seeker. I hike to...,897a4d99-a0d4-474a-b34d-78ab1e062a93,1820882b-7561-4f92-9f8d-8c241be21cf6
3,Sierra Trailson,Weekend wanderer and sunrise seeker. I hike to...,897a4d99-a0d4-474a-b34d-78ab1e062a93,1f85d55e-0b0c-4dd7-a9b0-82e4cb699145
4,Sierra Trailson,Weekend wanderer and sunrise seeker. I hike to...,897a4d99-a0d4-474a-b34d-78ab1e062a93,40ddb76d-be61-4b45-9ec1-1d917e3a08c0
...,...,...,...,...
147,Samantha Waters,Im into doing outdoor activities!,355e5f7c-87fa-45c4-a218-ff904b7a1128,6c22dd13-897c-4b98-95d0-32dda931a2d3
148,Samantha Waters,Im into doing outdoor activities!,355e5f7c-87fa-45c4-a218-ff904b7a1128,93c2f1f1-c92f-4a33-af8e-669b40d1b34e
149,Samantha Waters,Im into doing outdoor activities!,355e5f7c-87fa-45c4-a218-ff904b7a1128,98df11b0-8fe1-481f-b6e3-464a9b8c141a
150,Samantha Waters,Im into doing outdoor activities!,355e5f7c-87fa-45c4-a218-ff904b7a1128,baa13863-6091-4872-ba3d-b29a09af0d05


In [0]:
grouped_df = merged_user_interests.groupby("user_id")['interest_id'].apply(list).reset_index()
grouped_df

Unnamed: 0,user_id,interest_id
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,"[15bb36f9-7466-4e83-a148-5096114cce9e, 62b8624..."
1,0b7249ca-cc82-4ff1-9d50-d8d61280b717,"[17a74d46-88b0-40a8-afa8-bbca0d79d285, 1888b45..."
2,355e5f7c-87fa-45c4-a218-ff904b7a1128,"[4b57788f-1d57-4286-b760-14e6fbccf2f6, 6c22dd1..."
3,3e6acdeb-aa9a-47f1-879d-d7139eb98e2f,"[06b52f51-32a3-4921-948c-23cb1f474ca3, 15bb36f..."
4,3fb510a5-58a0-4994-81ba-29e69fa79db1,"[6117a58e-f12f-4af1-b031-da9fa32d0742, 6c22dd1..."
5,436c7907-3ded-4313-832a-831fd3259848,"[06b52f51-32a3-4921-948c-23cb1f474ca3, 0f790fe..."
6,449bc0ab-6942-426e-b573-9cb690ffd1b9,"[1820882b-7561-4f92-9f8d-8c241be21cf6, 4388845..."
7,45365d21-9977-4505-a0f0-7f090bc33747,"[1888b454-b69e-46bf-bc92-24e6a1085518, 40ddb76..."
8,740f8541-0638-4348-8fd9-72453613be4e,"[72a885d5-db6e-4148-9d2a-09347d948451, 86b6447..."
9,76e52c42-0d73-42b7-95c4-20aadcd886ae,"[06b52f51-32a3-4921-948c-23cb1f474ca3, 0f790fe..."


In [0]:
# interest-interest recommender here
"""The logic behind adding the following content-based method to generated embeddings based on short descriptions of each of the interests and then compare the similarity of the interests using those embeddings. These results can simply be used in the 'similarity_score' attribute that's in the interest_recommendations_table or can be used to augment the number of interests recommended if there's ever a shortage of user-data"""

interest_definitions = {
    "math": "The study of numbers, quantities, patterns, and relationships using abstract reasoning.",
    "pottery": "The art of shaping and firing clay to create functional or decorative objects.",
    "knitting": "A craft involving interlocking loops of yarn with needles to create fabrics or garments.",
    "yoga": "A practice combining physical postures, breathing techniques, and meditation to promote well-being.",
    "rollerskating": "The activity of gliding on roller skates for recreation or sport on smooth surfaces.",
    "skiing": "A winter sport where individuals slide over snow on skis, either downhill or cross-country.",
    "playing instruments": "The practice of producing music by performing on various musical instruments.",
    "cardio": "Exercises that raise the heart rate to improve cardiovascular endurance and overall fitness.",
    "studying": "The process of acquiring knowledge or skills through reading, practice, or instruction.",
    "sewing": "The craft of stitching fabrics together with a needle and thread to create or repair textiles.",
    "concerts": "Live music performances where one or more musicians entertain an audience.",
    "crocheting": "A handicraft using a hooked needle to loop yarn into fabrics or decorative items.",
    "fishing": "The activity of catching fish for sport, recreation, or food using various techniques.",
    "snowboarding": "A winter sport in which one rides down slopes on a board attached to their feet.",
    "baseball": "A bat-and-ball team sport involving pitching, hitting, and fielding on a diamond-shaped field.",
    "pilates": "An exercise regimen focused on core strength, flexibility, and overall body alignment.",
    "vegetarian": "A dietary lifestyle that excludes meat, emphasizing fruits, vegetables, and plant-based foods.",
    "volleyball": "A team sport in which players hit a ball over a net aiming to ground it on the opponent’s court.",
    "boating": "The activity of operating or riding in a boat for leisure, sport, or transportation.",
    "desserts and pastry": "The culinary art of creating sweet baked goods and confections.",
    "drawing": "The art of creating images on a surface using pencils, pens, or other drawing tools.",
    "science": "The systematic study of the natural world through observation, experimentation, and analysis.",
    "skating": "The activity of gliding on surfaces using ice skates or roller skates for leisure or sport.",
    "car shows": "Events or exhibitions where a variety of cars are displayed, often highlighting design and performance.",
    "painting": "The practice of applying pigments to a surface, such as canvas, to create artistic compositions.",
    "motorbikes": "The hobby or sport involving riding, maintaining, or customizing two-wheeled motorized vehicles.",
    "history": "The study of past events and human experiences, focusing on understanding change over time.",
    "meditation": "A practice of focused attention and mindfulness aimed at achieving mental clarity and calm.",
    "trains": "Railway vehicles or systems used for transporting people and goods, often admired by enthusiasts.",
    "self-improvement": "Activities aimed at enhancing personal skills, habits, or lifestyle for overall growth.",
    "vegan": "A lifestyle that avoids all animal products, emphasizing a plant-based diet and ethical choices.",
    "basketball": "A team sport where players attempt to score points by shooting a ball through a raised hoop.",
    "lifting": "Usually referring to weightlifting or resistance training aimed at building strength and muscle.",
    "reading": "The act of interpreting written or printed material for education, pleasure, or information.",
    "racing": "Competitive events in which individuals or vehicles compete for the fastest time or highest speed.",
    "culture": "The collective customs, traditions, arts, and social practices of a group or society.",
    "camping": "An outdoor recreational activity that involves spending time overnight in nature, typically in tents.",
    "philosophy": "The study of fundamental questions about existence, knowledge, values, and reasoning.",
    "nutrition": "The study of food, nutrients, and their effects on health, along with dietary practices.",
    "tutoring": "Providing personalized instruction or guidance to help someone better understand a subject.",
    "hiking": "A recreational activity involving long walks in natural environments, typically on trails.",
    "poetry": "A form of literary art that expresses ideas and emotions through imaginative use of language and structure.",
    "sculpting": "The art of creating three-dimensional works by carving, modeling, or assembling materials.",
    "international cuisine": "The exploration and preparation of dishes from diverse cultures around the world.",
    "meal prep": "The practice of planning and preparing meals in advance to ensure a consistent, healthy diet.",
    "software": "Programs and applications that instruct a computer on how to perform specific tasks.",
    "hardware": "The physical components of computers or electronic devices that facilitate their function."
}


In [0]:
# validate the data (at least visually)
user_interest_df_merge = user_interest_df.copy()

merged_df = user_interest_df_merge.merge(
    interests_df,
    left_on="interest_id",
    right_on="id",
    how='inner'
)
merged_df

Unnamed: 0,user_id,interest_id,has_interest,id,name,category_id
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,15bb36f9-7466-4e83-a148-5096114cce9e,1,15bb36f9-7466-4e83-a148-5096114cce9e,Yoga,3e50575e-4896-4cf1-a98b-35f629474335
1,3e6acdeb-aa9a-47f1-879d-d7139eb98e2f,15bb36f9-7466-4e83-a148-5096114cce9e,1,15bb36f9-7466-4e83-a148-5096114cce9e,Yoga,3e50575e-4896-4cf1-a98b-35f629474335
2,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,62b86242-7014-499f-9cb9-42cfca036de9,1,62b86242-7014-499f-9cb9-42cfca036de9,Pilates,3e50575e-4896-4cf1-a98b-35f629474335
3,449bc0ab-6942-426e-b573-9cb690ffd1b9,62b86242-7014-499f-9cb9-42cfca036de9,1,62b86242-7014-499f-9cb9-42cfca036de9,Pilates,3e50575e-4896-4cf1-a98b-35f629474335
4,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6663208a-593c-4c58-8df2-1cf2fda3cce6,1,6663208a-593c-4c58-8df2-1cf2fda3cce6,Vegetarian,9d9196f4-d3e1-436b-91f7-596393dcc94e
...,...,...,...,...,...,...
147,ec03cce8-e743-45af-97e9-71bc05588376,9b9d4d70-dec0-46a9-b7ef-47a773735861,1,9b9d4d70-dec0-46a9-b7ef-47a773735861,Vegan,9d9196f4-d3e1-436b-91f7-596393dcc94e
148,a99e13b8-fc64-48c6-a502-72e91972c107,03f7dc13-cd91-4eaf-95c2-59dc783c8e03,1,03f7dc13-cd91-4eaf-95c2-59dc783c8e03,Math,324b188f-361a-416d-9f58-83b9a2e8606c
149,c2c9fd5e-2040-4ee7-92ec-4bd65a467a27,7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c,1,7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c,Science,324b188f-361a-416d-9f58-83b9a2e8606c
150,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c,1,7ee3f21b-2e73-4ddd-b6ab-0ae7d7f60a5c,Science,324b188f-361a-416d-9f58-83b9a2e8606c


In [0]:
user_interest_names_df = merged_df.groupby("user_id")['name'].apply(list).reset_index()
user_interest_names_df

Unnamed: 0,user_id,name
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,"[Yoga, Pilates, Vegetarian, Car Shows, Trains,..."
1,0b7249ca-cc82-4ff1-9d50-d8d61280b717,"[Rollerskating, Playing Instruments, Sewing, C..."
2,355e5f7c-87fa-45c4-a218-ff904b7a1128,"[Fishing, Boating, Meditation, Self-Improvemen..."
3,3e6acdeb-aa9a-47f1-879d-d7139eb98e2f,"[Yoga, Boating, Self-Improvement, Pottery, Scu..."
4,3fb510a5-58a0-4994-81ba-29e69fa79db1,"[Car Shows, Trains, Boating, Self-Improvement,..."
5,436c7907-3ded-4313-832a-831fd3259848,"[Concerts, Reading, Pottery, Knitting, Drawing..."
6,449bc0ab-6942-426e-b573-9cb690ffd1b9,"[Pilates, Concerts, Meditation, Self-Improveme..."
7,45365d21-9977-4505-a0f0-7f090bc33747,"[Playing Instruments, Sewing, Concerts, Hiking..."
8,740f8541-0638-4348-8fd9-72453613be4e,"[Self-Improvement, Hiking, International Cuisi..."
9,76e52c42-0d73-42b7-95c4-20aadcd886ae,"[Meditation, Hiking, Pottery, International Cu..."


In [0]:
user_id = "02048efa-1c9c-4ec3-a3b3-00b83b683c98"
spec_user_interests = user_interest_names_df.loc[0]['name']
spec_user_interests = [item.lower() for item in spec_user_interests] # converting to lowercase to match format
spec_user_interests

['yoga', 'pilates', 'vegetarian', 'car shows', 'trains', 'culture']

In [0]:
# from sentence_transformers import SentenceTransformer # takes like 16 seconds
# import numpy as np


In [0]:
# model = SentenceTransformer('all-MiniLM-L6-v2') # takes 4 sec
# by default seems to be dimension 384 I think?

In [0]:
candidate_interests = [interest for interest in interest_definitions.keys() if interest not in spec_user_interests]
candidate_interests

['math',
 'pottery',
 'knitting',
 'rollerskating',
 'skiing',
 'playing instruments',
 'cardio',
 'studying',
 'sewing',
 'concerts',
 'crocheting',
 'fishing',
 'snowboarding',
 'baseball',
 'volleyball',
 'boating',
 'desserts and pastry',
 'drawing',
 'science',
 'skating',
 'painting',
 'motorbikes',
 'history',
 'meditation',
 'self-improvement',
 'vegan',
 'basketball',
 'lifting',
 'reading',
 'racing',
 'camping',
 'philosophy',
 'nutrition',
 'tutoring',
 'hiking',
 'poetry',
 'sculpting',
 'international cuisine',
 'meal prep',
 'software',
 'hardware']

In [0]:
user_descriptions = [interest_definitions[interest] for interest in spec_user_interests]
user_descriptions

['A practice combining physical postures, breathing techniques, and meditation to promote well-being.',
 'An exercise regimen focused on core strength, flexibility, and overall body alignment.',
 'A dietary lifestyle that excludes meat, emphasizing fruits, vegetables, and plant-based foods.',
 'Events or exhibitions where a variety of cars are displayed, often highlighting design and performance.',
 'Railway vehicles or systems used for transporting people and goods, often admired by enthusiasts.',
 'The collective customs, traditions, arts, and social practices of a group or society.']

In [0]:
candidate_descriptions = [interest_definitions[interest] for interest in candidate_interests]
candidate_descriptions

['The study of numbers, quantities, patterns, and relationships using abstract reasoning.',
 'The art of shaping and firing clay to create functional or decorative objects.',
 'A craft involving interlocking loops of yarn with needles to create fabrics or garments.',
 'The activity of gliding on roller skates for recreation or sport on smooth surfaces.',
 'A winter sport where individuals slide over snow on skis, either downhill or cross-country.',
 'The practice of producing music by performing on various musical instruments.',
 'Exercises that raise the heart rate to improve cardiovascular endurance and overall fitness.',
 'The process of acquiring knowledge or skills through reading, practice, or instruction.',
 'The craft of stitching fabrics together with a needle and thread to create or repair textiles.',
 'Live music performances where one or more musicians entertain an audience.',
 'A handicraft using a hooked needle to loop yarn into fabrics or decorative items.',
 'The activi

In [0]:
# user_embeddings = model.encode(user_descriptions) # both of these took like .5 sec
# candidate_embeddings = model.encode(candidate_descriptions)

In [0]:
# # Mean Pooling 

# user_centroid =np.mean(user_embeddings,axis=0).reshape(1, -1) # here we're averaging the embeddings created from just the user's interests to create single vector, which will then be compared against the possible candidate embeddings
# similarities_mean = cosine_similarity(user_centroid, candidate_embeddings).flatten()
# recommendations_mean = sorted(zip(candidate_interests, similarities_mean), key=lambda x: x[1], reverse=True)

# print("Recommendations using Mean Pooling (Averaging):")
# for interest, score in recommendations_mean:
#     print(f"{interest}: {score:.2f}")

Recommendations using Mean Pooling (Averaging):
self-improvement: 0.60
hiking: 0.54
motorbikes: 0.53
vegan: 0.48
boating: 0.48
nutrition: 0.46
camping: 0.41
lifting: 0.40
playing instruments: 0.39
baseball: 0.39
racing: 0.38
cardio: 0.38
skating: 0.37
fishing: 0.37
meditation: 0.36
rollerskating: 0.35
software: 0.34
meal prep: 0.34
poetry: 0.33
tutoring: 0.32
studying: 0.32
skiing: 0.31
international cuisine: 0.31
snowboarding: 0.30
concerts: 0.30
hardware: 0.29
science: 0.28
volleyball: 0.27
reading: 0.27
history: 0.26
basketball: 0.26
pottery: 0.25
sewing: 0.25
sculpting: 0.24
knitting: 0.24
math: 0.23
desserts and pastry: 0.21
crocheting: 0.21
painting: 0.20
drawing: 0.20
philosophy: 0.17


In [0]:
# # Maximum Similarity Aggregation

# # Compute pairwise cosine similarity between each user's interest embedding and each candidate
# all_similarities = cosine_similarity(user_embeddings, candidate_embeddings)

# # For each candidate interest, take the maximum similarity across all user interests
# max_similarities = all_similarities.max(axis=0)

# recommendations_max = sorted(zip(candidate_interests, max_similarities), key=lambda x: x[1], reverse=True)

# print("\nRecommendations using Maximum Similarity Aggregation:")
# for interest, score in recommendations_max:
#     print(f"{interest}: {score:.2f}")


Recommendations using Maximum Similarity Aggregation:
vegan: 0.79
nutrition: 0.65
meditation: 0.54
racing: 0.53
lifting: 0.50
cardio: 0.48
meal prep: 0.47
motorbikes: 0.46
self-improvement: 0.46
boating: 0.44
playing instruments: 0.38
hiking: 0.38
poetry: 0.36
international cuisine: 0.36
fishing: 0.33
concerts: 0.33
camping: 0.32
reading: 0.30
baseball: 0.30
studying: 0.30
snowboarding: 0.29
skating: 0.28
desserts and pastry: 0.28
rollerskating: 0.28
tutoring: 0.27
hardware: 0.26
painting: 0.26
software: 0.26
volleyball: 0.26
pottery: 0.25
sewing: 0.25
basketball: 0.25
sculpting: 0.24
drawing: 0.24
skiing: 0.24
crocheting: 0.23
science: 0.23
knitting: 0.21
history: 0.21
philosophy: 0.20
math: 0.19


In [0]:
from datetime import datetime

num_recommendations_for_user = 20

interest_recommendations = []
existing_recommendations = set()


for user in similarity_df.index:

# for idx, row in grouped_df.iterrows():
#     user = row['user_id']
#     user_interests = row['interest_id']

    # print(f"user {user}")
    user_similarities = similarity_df.loc[user]

    user_similarities = user_similarities[user_similarities.index != user] # removing the user_id in question from results
    user_similarities = user_similarities[user_similarities > 0] # removing zero values
    user_similarities = user_similarities[user_similarities != 1] # just incase there's ever profiles that are exactly the same
    count = 0

    # sort by descending similarity
    user_similarities = user_similarities.sort_values(ascending=False) 
    for similar_user, sim_score in user_similarities.items():
        
        # I'm exluding answers from users who have a non-zero similarity_score but who are, really, not that similar
        # at 200-300 score, the users only have 2-3 interests in common
        # I might up this threshold once I get a interest-interest recommender up and going 

        similar_user_interests = grouped_df.loc[grouped_df['user_id'] == similar_user, "interest_id"].tolist()[0]
        user_interests = grouped_df.loc[grouped_df['user_id'] == user, "interest_id"].tolist()[0]
     

        difference = [item for item in similar_user_interests if item not in user_interests]

        for different_interest in difference:
            # maybe include an element of randomness? not sure -- efficiency wise it would be best just to put all interests that are different (from similar users)
            if (user, different_interest) not in existing_recommendations:

                interest_recommendations.append({
                    "user_id": user,
                    "recommended_interest_id": different_interest,
                    "similarity_score": 1, # might calculate this based on the similarity score of the user from whom it's taken? what about we take the embedding calculation here
                    "has_been_recomended": True,
                    "timestamp": datetime.utcnow().isoformat(), # converting to iso to make it serializable for JSON
                    "embedding": None, 
                    })
                existing_recommendations.add((user, different_interest))
                count += 1

            if count >= num_recommendations_for_user:
                break


interest_recommendations_df = pd.DataFrame(interest_recommendations)
interest_recommendations_df

Unnamed: 0,user_id,recommended_interest_id,similarity_score,has_been_recomended,timestamp,embedding
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6117a58e-f12f-4af1-b031-da9fa32d0742,1,True,2025-04-16T20:45:02.267934,
1,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6c22dd13-897c-4b98-95d0-32dda931a2d3,1,True,2025-04-16T20:45:02.267959,
2,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,88d66a6d-8bdb-413e-8a21-a2012c737f16,1,True,2025-04-16T20:45:02.267969,
3,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,98df11b0-8fe1-481f-b6e3-464a9b8c141a,1,True,2025-04-16T20:45:02.267977,
4,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,ba47674f-e078-4769-85d8-8e46e20d39c8,1,True,2025-04-16T20:45:02.267986,
...,...,...,...,...,...,...
437,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,6663208a-593c-4c58-8df2-1cf2fda3cce6,1,True,2025-04-16T20:45:03.358823,
438,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,7e21b3d7-9a17-4e2a-aa69-a4feba8e7c84,1,True,2025-04-16T20:45:03.359220,
439,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,86b64472-50ea-4d2e-b062-69024291cad4,1,True,2025-04-16T20:45:03.359231,
440,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,0f790fe0-7859-4408-9b98-c8cee119d659,1,True,2025-04-16T20:45:03.359796,


In [0]:
interest_recommendations_df

Unnamed: 0,user_id,recommended_interest_id,similarity_score,has_been_recomended,timestamp,embedding
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6117a58e-f12f-4af1-b031-da9fa32d0742,1,True,2025-04-16T20:45:02.267934,
1,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6c22dd13-897c-4b98-95d0-32dda931a2d3,1,True,2025-04-16T20:45:02.267959,
2,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,88d66a6d-8bdb-413e-8a21-a2012c737f16,1,True,2025-04-16T20:45:02.267969,
3,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,98df11b0-8fe1-481f-b6e3-464a9b8c141a,1,True,2025-04-16T20:45:02.267977,
4,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,ba47674f-e078-4769-85d8-8e46e20d39c8,1,True,2025-04-16T20:45:02.267986,
...,...,...,...,...,...,...
437,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,6663208a-593c-4c58-8df2-1cf2fda3cce6,1,True,2025-04-16T20:45:03.358823,
438,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,7e21b3d7-9a17-4e2a-aa69-a4feba8e7c84,1,True,2025-04-16T20:45:03.359220,
439,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,86b64472-50ea-4d2e-b062-69024291cad4,1,True,2025-04-16T20:45:03.359231,
440,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,0f790fe0-7859-4408-9b98-c8cee119d659,1,True,2025-04-16T20:45:03.359796,


In [0]:

# delete all user_interest_recommendations every 7 days? 14 days? At some set interval? 

old_user_interest_recs_df = pull_full_table("user_interest_recommendations")
old_user_interest_recs_df


Total Rows in Table: 205
Num Batches to get Total Rows 1
For batch: 1, start: 0, end: 205


Unnamed: 0,id,user_id,recommended_interest_id,similarity_score,has_been_recomended,timestamp,embedding
0,7f985db6-f926-460a-8ccc-8f0e56ade608,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,1820882b-7561-4f92-9f8d-8c241be21cf6,1,True,2025-04-16T20:45:02.270091,
1,7c50e214-4e1e-45e8-9ab3-8e8675b109b4,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,5bf4236f-11ed-4332-8bb5-c0cc009adc99,1,True,2025-04-16T20:45:02.270114,
2,790ce3ea-edd8-4474-b717-02cba7125d2f,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,93c2f1f1-c92f-4a33-af8e-669b40d1b34e,1,True,2025-04-16T20:45:02.270123,
3,79846d95-2623-4289-9206-972a54cac142,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,fea6ed7d-9c6b-44ac-9e8e-198ca9a681a2,1,True,2025-04-16T20:45:02.270708,
4,e2327c94-987c-4f66-9b2a-60f85fc7fa5f,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,ff459b3e-f2aa-4057-b7c2-8a8ddf98bd9d,1,True,2025-04-16T20:45:02.273242,
...,...,...,...,...,...,...,...
200,1eed5709-07c2-4b4c-80a5-1b5cd371fe6e,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,6663208a-593c-4c58-8df2-1cf2fda3cce6,1,True,2025-04-16T20:45:03.358823,
201,c9bb4bf5-3581-4ec1-888f-f6be802e12dd,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,7e21b3d7-9a17-4e2a-aa69-a4feba8e7c84,1,True,2025-04-16T20:45:03.35922,
202,9902ddd5-6808-4414-acc4-f6806b07dc45,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,86b64472-50ea-4d2e-b062-69024291cad4,1,True,2025-04-16T20:45:03.359231,
203,c153dc8a-da84-45f0-976f-8df4ac6779a9,f8f9ec17-fb66-4689-abc2-529d61dfb1f4,0f790fe0-7859-4408-9b98-c8cee119d659,1,True,2025-04-16T20:45:03.359796,


In [0]:
# compare the output of old recommendations with new recommendations and remove duplicates 

# unrealistic threshold to prevent accidental deletion
day_threshold = 100

# if oldest date found in old_user_user_recommendations has a 7 day difference between it and utcnow()
if check_oldest_timestamp(old_user_interest_recs_df, day_threshold=day_threshold):
    # uncomment when live
    delete_table("user_interest_recommendations")
else:
    print("removing duplicates from new batch of recommendations_df")
    filtered_recommendations_df = remove_duplicate_interest_recommendations(interest_recommendations_df, old_user_interest_recs_df)
    filtered_recommendations_df

old_recs_df is not older than day threshold set, it should be kept
removing duplicates from new batch of recommendations_df


In [0]:
filtered_recommendations_df

Unnamed: 0,user_id,recommended_interest_id,similarity_score,has_been_recomended,timestamp,embedding
0,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6117a58e-f12f-4af1-b031-da9fa32d0742,1,True,2025-04-16T20:45:02.267934,
1,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,6c22dd13-897c-4b98-95d0-32dda931a2d3,1,True,2025-04-16T20:45:02.267959,
2,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,88d66a6d-8bdb-413e-8a21-a2012c737f16,1,True,2025-04-16T20:45:02.267969,
3,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,98df11b0-8fe1-481f-b6e3-464a9b8c141a,1,True,2025-04-16T20:45:02.267977,
4,02c3c862-b8a5-4fde-ac56-e3b5545f18d6,ba47674f-e078-4769-85d8-8e46e20d39c8,1,True,2025-04-16T20:45:02.267986,
...,...,...,...,...,...,...
232,ecb95c11-4923-42d0-9a45-01eb27cb832a,62b86242-7014-499f-9cb9-42cfca036de9,1,True,2025-04-16T20:45:03.230850,
233,ecb95c11-4923-42d0-9a45-01eb27cb832a,6663208a-593c-4c58-8df2-1cf2fda3cce6,1,True,2025-04-16T20:45:03.230860,
234,ecb95c11-4923-42d0-9a45-01eb27cb832a,ba5bc1b4-501c-4fbd-899c-63375b9b5ca3,1,True,2025-04-16T20:45:03.230869,
235,ecb95c11-4923-42d0-9a45-01eb27cb832a,4b57788f-1d57-4286-b760-14e6fbccf2f6,1,True,2025-04-16T20:45:03.231372,


In [0]:
# remove recommendations based on various criteria 

# AGE


# LOCATION



In [0]:
records = filtered_recommendations_df.to_dict(orient='records')
records


[{'user_id': '02c3c862-b8a5-4fde-ac56-e3b5545f18d6',
  'recommended_interest_id': '6117a58e-f12f-4af1-b031-da9fa32d0742',
  'similarity_score': 1,
  'has_been_recomended': True,
  'timestamp': '2025-04-16T20:45:02.267934',
  'embedding': None},
 {'user_id': '02c3c862-b8a5-4fde-ac56-e3b5545f18d6',
  'recommended_interest_id': '6c22dd13-897c-4b98-95d0-32dda931a2d3',
  'similarity_score': 1,
  'has_been_recomended': True,
  'timestamp': '2025-04-16T20:45:02.267959',
  'embedding': None},
 {'user_id': '02c3c862-b8a5-4fde-ac56-e3b5545f18d6',
  'recommended_interest_id': '88d66a6d-8bdb-413e-8a21-a2012c737f16',
  'similarity_score': 1,
  'has_been_recomended': True,
  'timestamp': '2025-04-16T20:45:02.267969',
  'embedding': None},
 {'user_id': '02c3c862-b8a5-4fde-ac56-e3b5545f18d6',
  'recommended_interest_id': '98df11b0-8fe1-481f-b6e3-464a9b8c141a',
  'similarity_score': 1,
  'has_been_recomended': True,
  'timestamp': '2025-04-16T20:45:02.267977',
  'embedding': None},
 {'user_id': '02c3c8

In [0]:
# don't uncomment this out until you've figured out how to remove duplicates
response = supabase.table("user_interest_recommendations").insert(records).execute()

In [0]:
user_interest_matrix = user_interest_matrix_unprocessed.copy()

# remapping the user_id to be n_users - 1 (to work with sparse matrix) 
user_interest_matrix.index = user_interest_matrix.index.map(lambda uid: new_user_id_map[uid])
user_interest_matrix.index.name = None
user_interest_matrix.index.name = "user_id"
user_interest_matrix

In [0]:
# remapping interest_id to be n_interests to work with sparse matrix 
user_interest_matrix.rename(columns=new_interest_id_map, inplace=True)
user_interest_matrix

In [0]:
# stuff still to do 
#1. TODO Error handling incase a user is deleted mid-process? Not sure how to handle that