In [67]:
import pandas as pd
import numpy as np
import clip
import torch
import requests
from io import BytesIO
from PIL import Image
from deepface import DeepFace
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv


In [40]:
# Paths to the two prediction exports, relative to this notebook.
COMPILED_OUTPUT = r'../outputs/compiled_output.csv'
CUSTOM_PREDICTIONS = r'../outputs/gender_predictions.csv'

compiled_df = pd.read_csv(COMPILED_OUTPUT)
custom_prediction_df = pd.read_csv(CUSTOM_PREDICTIONS)

# Keep only rows whose gender was resolved. The explicit .copy() makes the
# result an independent frame, so adding columns to it (here and in later
# cells) never writes into a slice of compiled_df and does not raise
# SettingWithCopyWarning.
compiled_df_valid = compiled_df[compiled_df['gender'] != 'unknown'].copy()
compiled_df_valid['predicted_by'] = 'pretrained_deepface'

# Drop bookkeeping columns, but only those actually present in the frame.
drop_cols = [col for col in ['source_file'] if col in compiled_df_valid.columns]
compiled_df_valid = compiled_df_valid.drop(columns=drop_cols)

In [41]:
# Show which columns remain after the drop in the previous cell.
compiled_df_valid.columns

Index(['username', 'gender', 'confidence_score', 'avatar', 'predicted_by'], dtype='object')

In [42]:
# Columns to discard from the custom-model export: pandas' auto-generated
# 'Unnamed*' index columns plus the fields that are superseded by the renamed
# prediction columns below. Only columns that exist in the frame are listed.
drop_cols = [
    col
    for col in custom_prediction_df.columns
    if col.startswith('Unnamed')
    or col in ('source_file', 'gender', 'new_gender', 'confidence_score')
]

# Drop the stale columns, then give the model outputs the same column names
# that compiled_df_valid uses.
custom_prediction_df = (
    custom_prediction_df
    .drop(columns=drop_cols)
    .rename(columns={
        'predicted_gender': 'gender',
        'confidence': 'confidence_score',
    })
)

In [43]:
def _confidence_breakdown(gender, confidence):
    """Expand one (label, confidence) pair into a {'Woman': pct, 'Man': pct} dict.

    Args:
        gender: Predicted label. Compared case-insensitively against
            'woman' / 'man'; anything else (including a missing, non-string
            value) yields the all-zero fallback.
        confidence: Confidence of the predicted label as a fraction.

    Returns:
        dict with keys 'Woman' and 'Man'. The predicted label receives
        ``round(confidence, 2)`` expressed as a percentage and the other label
        receives the complement.
    """
    # A missing gender is a float NaN, which has no .lower(); treat it like
    # any other unrecognised label instead of raising AttributeError.
    label = gender.lower() if isinstance(gender, str) else None
    if label not in ('woman', 'man'):
        return {'Woman': 0.0, 'Man': 0.0}

    # Multiplying a rounded fraction by 100 can leave float noise
    # (e.g. 0.57 * 100 == 56.99999999999999), so round once more after scaling.
    predicted_pct = round(round(confidence, 2) * 100, 2)
    other_pct = round(round(1 - confidence, 2) * 100, 2)

    if label == 'woman':
        return {'Woman': predicted_pct, 'Man': other_pct}
    return {'Woman': other_pct, 'Man': predicted_pct}


# Replace the scalar confidence with a per-class percentage dict, one per row.
custom_prediction_df['confidence_score'] = [
    _confidence_breakdown(gender, confidence)
    for gender, confidence in zip(
        custom_prediction_df['gender'], custom_prediction_df['confidence_score']
    )
]


In [44]:
# Tag every row of the custom-model frame with the model that produced it.
custom_prediction_df['predicted_by'] = 'custom_model_finetuned'

In [45]:
# Compare the column sets of both frames before concatenating them.
compiled_df_valid.columns, custom_prediction_df.columns

(Index(['username', 'gender', 'confidence_score', 'avatar', 'predicted_by'], dtype='object'),
 Index(['username', 'avatar', 'gender', 'confidence_score', 'predicted_by'], dtype='object'))

In [46]:
# Stack both prediction sources into one frame; ignore_index=True discards the
# original row labels and renumbers the result.
combined_df = pd.concat([compiled_df_valid, custom_prediction_df], ignore_index=True)


In [47]:
# Total number of rows after concatenation.
len(combined_df)

40261

In [48]:
# Row counts of the two source frames, for comparison with the combined length.
len(compiled_df_valid),len(custom_prediction_df)

(19336, 20925)

In [49]:
##### EXTRACTING THE EMBEDDINGS AND ATTRIBUTES #####
# Run CLIP on CUDA when torch reports it as available, otherwise on the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"
# clip.load returns the model together with its image preprocessing callable.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
# GPT-2 text-generation pipeline (could be swapped for the OpenAI API).
llm_pipeline = pipeline(task="text-generation", model="gpt2")


Device set to use mps:0


In [50]:
def get_face_embedding(image_path):
    """Return the Facenet embedding of the first result DeepFace gives for an image.

    Args:
        image_path: Value forwarded to ``DeepFace.represent`` as ``img_path``.

    Returns:
        The ``'embedding'`` entry of the first returned representation, or
        ``None`` if any exception is raised (the error is printed).
    """
    try:
        representations = DeepFace.represent(img_path=image_path, model_name='Facenet')
        first_result = representations[0]
    except Exception as e:
        print(f"[Face] No face found: {e}")
        return None
    else:
        return first_result['embedding']

In [51]:
def get_clip_image_embedding(image_path):
    """Download an image from a URL and return its CLIP image embedding.

    Args:
        image_path: URL of the image; fetched with ``requests.get`` using a
            10 second timeout.

    Returns:
        The squeezed output of ``clip_model.encode_image`` as a NumPy array,
        or ``None`` if the download, decoding or encoding fails (the error is
        printed rather than raised).
    """
    try:
        response = requests.get(image_path, timeout=10)
        response.raise_for_status()  # Raise error for bad status
        image = Image.open(BytesIO(response.content)).convert("RGB")
        # The model was loaded onto `device`, so the input batch must live on
        # the same device (mirrors get_clip_text_embedding).
        image_input = clip_preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            # .cpu() is required before .numpy(): tensors on an accelerator
            # cannot be converted to NumPy directly.
            embedding = clip_model.encode_image(image_input).squeeze().cpu().numpy()
        return embedding
    except Exception as e:
        print(f"Failed to get embedding for URL {image_path}: {e}")
        return None
    
def get_clip_text_embedding(text):
    """Encode a single text prompt with CLIP and return the result as a NumPy array.

    Args:
        text: Prompt to embed; it is tokenized as a one-element batch.

    Returns:
        The squeezed output of ``clip_model.encode_text``, moved to the CPU and
        converted to a NumPy array.
    """
    tokens = clip.tokenize([text])
    tokens = tokens.to(device)
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        encoded = clip_model.encode_text(tokens)
        text_features = encoded.squeeze().cpu().numpy()
    return text_features

In [73]:
def recommend_similar_images(query_text=None, query_image_url=None, top_k=5):
    """Rank stored profiles by cosine similarity to a text and/or image query.

    The query vector is the element-wise mean of the CLIP text embedding (if
    ``query_text`` is given) and the CLIP image embedding (if
    ``query_image_url`` is given and could be embedded). It is compared with
    the ``embeddings`` column of the ``of_profiles`` Supabase table, read
    through the module-level ``supabase`` client.

    Args:
        query_text: Optional text prompt.
        query_image_url: Optional URL of a query image.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame of the ``top_k`` most similar rows, ordered from most to
        least similar, with added ``embedding_array`` and ``similarity_score``
        columns. The DataFrame is empty when the table has no usable
        embeddings. An empty list is returned when no query embedding could
        be computed (kept for backward compatibility).

    Raises:
        ValueError: If neither ``query_text`` nor ``query_image_url`` is given.
    """
    if not query_text and not query_image_url:
        raise ValueError("At least one of 'query_text' or 'query_image_url' must be provided.")

    query_embeddings = []

    if query_text:
        query_embeddings.append(get_clip_text_embedding(query_text))

    if query_image_url:
        image_embedding = get_clip_image_embedding(query_image_url)
        # The image helper returns None on failure; skip it in that case.
        if image_embedding is not None:
            query_embeddings.append(image_embedding)

    if not query_embeddings:
        return []

    query_vector = np.mean(query_embeddings, axis=0).reshape(1, -1)

    # Fetch data from Supabase.
    # NOTE(review): a single select may be capped by the API's row limit —
    # confirm whether pagination is needed for the full table.
    response = supabase.table("of_profiles").select("*").execute()
    df = pd.DataFrame(response.data or [])

    # Keep only rows that carry an embedding. .copy() gives an independent
    # frame so the column added below is not written onto a slice.
    if 'embeddings' in df.columns:
        df = df[df['embeddings'].notna()].copy()
    else:
        df = df.iloc[0:0]

    # Nothing to rank: return an empty result instead of failing in
    # np.vstack / cosine_similarity.
    if df.empty:
        return df.assign(similarity_score=np.array([], dtype=np.float32))

    # Convert JSON embeddings to np.array
    df['embedding_array'] = df['embeddings'].apply(lambda x: np.array(x, dtype=np.float32))

    # One row per stored profile.
    image_features_matrix = np.vstack(df['embedding_array'].values.tolist())

    similarity_scores = cosine_similarity(query_vector, image_features_matrix)[0]
    # argsort is ascending, so reverse it to get the best matches first.
    top_indices = np.argsort(similarity_scores)[::-1][:top_k]

    return df.iloc[top_indices].assign(similarity_score=similarity_scores[top_indices])


In [None]:
# Compute a CLIP embedding for every avatar URL. The column starts out as
# None (object dtype) so rows without an embedding are null rather than ''.
compiled_df_valid['embeddings'] = None
for idx, row in compiled_df_valid.iterrows():
    try:
        # get_clip_image_embedding returns None itself when the download or
        # encoding fails, so that case is stored as None without raising.
        embedding = get_clip_image_embedding(row.avatar)
        compiled_df_valid.at[idx, 'embeddings'] = embedding
    except Exception as exc:
        # Exception (not a bare except) so that interrupting the kernel still
        # stops this long-running loop; the actual error is reported.
        print(f"Skipping {row.username} - no face or bad image: {exc}")
        compiled_df_valid.at[idx, 'embeddings'] = None


In [74]:
# Text-only search: rank the stored profiles against a text prompt, top 3 rows.
# NOTE(review): "tatoo" looks like a misspelling of "tattoo" — confirm; the
# output below was produced with the string exactly as written.
recommend_similar_images(query_text="tatoo", top_k=3)

Unnamed: 0,id,created_at,username,avatar,gender,confidence_score,predicted_by,embeddings,embedding_array,similarity_score
2,3,2025-05-18T04:30:25.705461+00:00,777p3ach,https://public.onlyfans.com/files/7/73/73d/73d...,woman,"{'Woman': 50.043076276779175, 'Man': 49.956917...",pretrained_deepface,"[0.026222987100481987, 0.03928045928478241, 0....","[0.026222987, 0.03928046, 0.030166619, -0.2567...",0.243854
0,1,2025-05-18T04:30:24.258725+00:00,19hislittlegoth91,https://public.onlyfans.com/files/b/bx/bxd/bxd...,man,"{'Woman': 11.30981296300888, 'Man': 88.6901855...",pretrained_deepface,"[-0.06269136816263199, 0.05155433341860771, -0...","[-0.06269137, 0.051554333, -0.30387303, 0.2713...",0.239834
4,5,2025-05-18T04:30:26.658133+00:00,a1day1princess,https://public.onlyfans.com/files/w/we/wes/wes...,man,"{'Woman': 4.30479571223259, 'Man': 95.69520354...",pretrained_deepface,"[0.23092104494571686, -0.027042806148529053, 0...","[0.23092104, -0.027042806, 0.007916119, -0.209...",0.230642


In [75]:
# Image-only search: rank the stored profiles against one avatar URL, top 3 rows.
recommend_similar_images(query_image_url="https://public.onlyfans.com/files/v/vk/vk5/vk5bd4pkhuxw0bgeklvlpc5yitjrwz211586828835/avatar.jpg", top_k=3)

Unnamed: 0,id,created_at,username,avatar,gender,confidence_score,predicted_by,embeddings,embedding_array,similarity_score
0,1,2025-05-18T04:30:24.258725+00:00,19hislittlegoth91,https://public.onlyfans.com/files/b/bx/bxd/bxd...,man,"{'Woman': 11.30981296300888, 'Man': 88.6901855...",pretrained_deepface,"[-0.06269136816263199, 0.05155433341860771, -0...","[-0.06269137, 0.051554333, -0.30387303, 0.2713...",0.759878
2,3,2025-05-18T04:30:25.705461+00:00,777p3ach,https://public.onlyfans.com/files/7/73/73d/73d...,woman,"{'Woman': 50.043076276779175, 'Man': 49.956917...",pretrained_deepface,"[0.026222987100481987, 0.03928045928478241, 0....","[0.026222987, 0.03928046, 0.030166619, -0.2567...",0.733666
1,2,2025-05-18T04:30:24.985493+00:00,2troubledouble,https://public.onlyfans.com/files/t/t2/t2f/t2f...,man,"{'Woman': 11.368048191070557, 'Man': 88.631945...",pretrained_deepface,"[-0.20426703989505768, 0.008542572148144245, -...","[-0.20426704, 0.008542572, -0.046812143, 0.014...",0.690365


In [76]:
# Combined image + text search: the query is the mean of both embeddings, top 5 rows.
recommend_similar_images(query_text="feet", query_image_url="https://public.onlyfans.com/files/v/vk/vk5/vk5bd4pkhuxw0bgeklvlpc5yitjrwz211586828835/avatar.jpg", top_k=5)


Unnamed: 0,id,created_at,username,avatar,gender,confidence_score,predicted_by,embeddings,embedding_array,similarity_score
0,1,2025-05-18T04:30:24.258725+00:00,19hislittlegoth91,https://public.onlyfans.com/files/b/bx/bxd/bxd...,man,"{'Woman': 11.30981296300888, 'Man': 88.6901855...",pretrained_deepface,"[-0.06269136816263199, 0.05155433341860771, -0...","[-0.06269137, 0.051554333, -0.30387303, 0.2713...",0.6412
2,3,2025-05-18T04:30:25.705461+00:00,777p3ach,https://public.onlyfans.com/files/7/73/73d/73d...,woman,"{'Woman': 50.043076276779175, 'Man': 49.956917...",pretrained_deepface,"[0.026222987100481987, 0.03928045928478241, 0....","[0.026222987, 0.03928046, 0.030166619, -0.2567...",0.624656
1,2,2025-05-18T04:30:24.985493+00:00,2troubledouble,https://public.onlyfans.com/files/t/t2/t2f/t2f...,man,"{'Woman': 11.368048191070557, 'Man': 88.631945...",pretrained_deepface,"[-0.20426703989505768, 0.008542572148144245, -...","[-0.20426704, 0.008542572, -0.046812143, 0.014...",0.587676
4,5,2025-05-18T04:30:26.658133+00:00,a1day1princess,https://public.onlyfans.com/files/w/we/wes/wes...,man,"{'Woman': 4.30479571223259, 'Man': 95.69520354...",pretrained_deepface,"[0.23092104494571686, -0.027042806148529053, 0...","[0.23092104, -0.027042806, 0.007916119, -0.209...",0.58181
3,4,2025-05-18T04:30:25.990874+00:00,a1bandzino_,https://public.onlyfans.com/files/n/no/nod/nod...,man,"{'Woman': 12.24442571401596, 'Man': 87.7555727...",pretrained_deepface,"[0.04114340990781784, -0.4302443265914917, 0.1...","[0.04114341, -0.43024433, 0.15611474, 0.119282...",0.517971


In [62]:
#### Generating search ####
# Keep only the rows whose 'embeddings' cell holds a NumPy vector of shape
# (512,); placeholders and failed downloads are filtered out. The .copy()
# detaches the result from compiled_df_valid.
compiled_df_valid_embeddings = compiled_df_valid.loc[
    compiled_df_valid['embeddings'].apply(
        lambda vec: isinstance(vec, np.ndarray) and vec.shape == (512,)
    )
].copy()


In [63]:
# Inspect the rows that ended up with a usable embedding.
compiled_df_valid_embeddings

Unnamed: 0,username,gender,confidence_score,avatar,predicted_by,embeddings
0,19hislittlegoth91,man,"{'Woman': 11.30981296300888, 'Man': 88.6901855...",https://public.onlyfans.com/files/b/bx/bxd/bxd...,pretrained_deepface,"[-0.06269137, 0.051554333, -0.30387303, 0.2713..."
2,2troubledouble,man,"{'Woman': 11.368048191070557, 'Man': 88.631945...",https://public.onlyfans.com/files/t/t2/t2f/t2f...,pretrained_deepface,"[-0.20426704, 0.008542572, -0.046812143, 0.014..."
3,777p3ach,woman,"{'Woman': 50.043076276779175, 'Man': 49.956917...",https://public.onlyfans.com/files/7/73/73d/73d...,pretrained_deepface,"[0.026222987, 0.03928046, 0.030166619, -0.2567..."
5,a1bandzino_,man,"{'Woman': 12.24442571401596, 'Man': 87.7555727...",https://public.onlyfans.com/files/n/no/nod/nod...,pretrained_deepface,"[0.04114341, -0.43024433, 0.15611474, 0.119282..."
8,a1day1princess,man,"{'Woman': 4.30479571223259, 'Man': 95.69520354...",https://public.onlyfans.com/files/w/we/wes/wes...,pretrained_deepface,"[0.23092104, -0.027042806, 0.007916119, -0.209..."


In [64]:
from supabase import create_client, Client
import os


In [65]:
# Check dtypes and non-null counts of the frame that will be uploaded.
compiled_df_valid_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 0 to 8
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   username          5 non-null      object
 1   gender            5 non-null      object
 2   confidence_score  5 non-null      object
 3   avatar            5 non-null      object
 4   predicted_by      5 non-null      object
 5   embeddings        5 non-null      object
dtypes: object(6)
memory usage: 280.0+ bytes


In [68]:
# Load the project-level .env file, then read the Supabase endpoint and key
# from the environment and build the client used by the cells that query or
# write the of_profiles table.
load_dotenv('../.env')
url, key = os.environ.get('SUPABASE_URL'), os.environ.get('SUPABASE_KEY')
supabase: Client = create_client(url, key)

In [70]:
import json

# Number of rows sent to Supabase per insert request.
UPLOAD_BATCH_SIZE = 100

df_to_upload = compiled_df_valid_embeddings.copy()
# NumPy arrays are not JSON-serialisable; store each embedding as a plain list.
df_to_upload['embeddings'] = df_to_upload['embeddings'].apply(lambda x: x.tolist())

# Convert DataFrame to a list of dicts
records = df_to_upload.to_dict(orient="records")

# Upload to Supabase table. insert() accepts a list of rows, so the records are
# sent in batches instead of one HTTP request per row. An empty `records` list
# results in no request at all.
# NOTE: this is a plain insert — re-running the cell uploads the rows again.
for start in range(0, len(records), UPLOAD_BATCH_SIZE):
    batch = records[start:start + UPLOAD_BATCH_SIZE]
    supabase.table("of_profiles").insert(batch).execute()