In [1]:
import firebase_admin
from firebase_admin import credentials, firestore
import openai
import numpy as np
from scipy.spatial.distance import cosine
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# OpenAI API key: read from the OPENAI_KEY environment variable.
# The lookup yields None when that variable is not set.

openai.api_key = os.environ.get("OPENAI_KEY")


In [90]:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Embed a single string with the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; must be a non-empty ``str``.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.

    Raises:
        ValueError: If ``text`` is falsy or is not a ``str``.
    """
    # Truthiness is evaluated before the type check, so "", None, 0 and []
    # are all rejected with the same error.
    if not (text and isinstance(text, str)):
        raise ValueError("Input text must be a non-empty string.")

    # The request carries a one-element batch, so only the first result matters.
    api_result = openai.embeddings.create(model=model, input=[text])
    first_item = api_result.data[0]
    return first_item.embedding

# Research filter: scoring posts against research-themed anchor sentences

In [112]:
# Reference sentences describing research-related content. Each one is embedded
# and posts are compared against those embeddings, so the exact wording of every
# string affects the resulting similarity scores.
RESEARCH_ANCHORS = [
    "This text is about some form of research, study, or academic investigation.",
    "We are discussing data collection, experiments, or analysis for a study.",
    "This post references scientific or scholarly work, like a paper, article, or peer-reviewed research.",
    "We plan to investigate a hypothesis, gather data, or analyze results for academic or scientific purposes.",
    "This text is an inquiry into or exploration of topics such as cognition, social media, philosophy, physics, social sciences, biology, web3, or mathematics.",
    "This text mentions publishing findings, writing a paper, or conducting an experiment.",
    "This content involves scholarly sources, references, or frameworks often used in academia.",
    "A person is doing a project or study to learn something new, test a theory, or gather evidence.",
    "This post indicates researching a topic, exploring data, or studying a phenomenon in a systematic way."
]

In [110]:
def compute_research_similarities(post_text, anchor_embeddings):
    """
    Score a post against a collection of precomputed anchor embeddings.

    The post is embedded once via ``get_embedding`` and compared with every
    anchor using cosine similarity (``1 - scipy.spatial.distance.cosine``).

    Args:
        post_text: Text of the post; passed unchanged to ``get_embedding``.
        anchor_embeddings: Iterable of anchor embedding vectors. It is
            materialised into a list, so a generator is accepted too.

    Returns:
        Tuple ``(max_sim, avg_sim)``: the highest similarity and the mean
        similarity across all anchors.

    Raises:
        ValueError: If ``anchor_embeddings`` contains no embeddings, or if
            ``get_embedding`` rejects ``post_text``.
    """
    # Validate before embedding the post: with no anchors there is nothing to
    # compare against, and the embedding request would be wasted.
    anchors = list(anchor_embeddings)
    if not anchors:
        raise ValueError("anchor_embeddings must contain at least one embedding.")

    post_embedding = get_embedding(post_text)

    # scipy's cosine() is a distance, so similarity is its complement.
    similarities = [1 - cosine(post_embedding, anchor_emb) for anchor_emb in anchors]

    max_sim = max(similarities)
    avg_sim = np.mean(similarities)
    return max_sim, avg_sim

In [113]:
# Embed every research anchor once, in the same order as RESEARCH_ANCHORS.
RESEARCH_ANCHORS_EMBEDDINGS = list(map(get_embedding, RESEARCH_ANCHORS))


In [114]:
# Two sample posts to score against the research anchors.
post1 = "Social media is the source of brain rot, and I'd like to investigate how it affects cognition."
post2 = "I love cats and want to share cute pictures of them."

# Post 1: score it, then report; the dashed line separates it from the next post.
max_sim_1, avg_sim_1 = compute_research_similarities(post1, RESEARCH_ANCHORS_EMBEDDINGS)
print(
    f"Post #1: {post1}",
    f"Max similarity: {max_sim_1}",
    f"Average similarity: {avg_sim_1}",
    "--------",
    sep="\n",
)

# Post 2: same report, without a trailing separator.
max_sim_2, avg_sim_2 = compute_research_similarities(post2, RESEARCH_ANCHORS_EMBEDDINGS)
print(
    f"Post #2: {post2}",
    f"Max similarity: {max_sim_2}",
    f"Average similarity: {avg_sim_2}",
    sep="\n",
)

Post #1: Social media is the source of brain rot, and I'd like to investigate how it affects cognition.
Max similarity: 0.47694739016820775
Average similarity: 0.2864055067911029
--------
Post #2: I love cats and want to share cute pictures of them.
Max similarity: 0.12346057104249197
Average similarity: 0.10074181780566627


# Re-post parsing

In [87]:
# Sample post and a generic "event" description to compare it with.
# Both values start and end with a newline; the event text also keeps its
# trailing spaces before the final newline.
post = "\nEthDenver is coming! I'm going 2025\n"

event = "\nI am going to an event, conference, seminar, festival, you should come  \n"

In [88]:
# get_embedding() already returns the embedding vector itself (it unpacks
# response.data[0].embedding internally), so the result is used directly.
embedding1 = get_embedding(post)


In [82]:
# get_embedding() already returns the embedding vector itself (it unpacks
# response.data[0].embedding internally), so the result is used directly.
embedding2 = get_embedding(event)

In [89]:
# scipy's cosine() returns a distance; its complement is the cosine similarity.
similarity = 1 - cosine(u=embedding1, v=embedding2)
print(similarity)

0.38469917527814435


# Event anchors

In [92]:
def compute_max_and_average_similarity(post_text, anchor_embeddings):
    """
    Score a post against a collection of precomputed anchor embeddings.

    The post is embedded once via ``get_embedding`` and compared with every
    anchor using cosine similarity (``1 - scipy.spatial.distance.cosine``).

    Args:
        post_text: Text of the post; passed unchanged to ``get_embedding``.
        anchor_embeddings: Iterable of anchor embedding vectors. It is
            materialised into a list, so a generator is accepted too.

    Returns:
        Tuple ``(max_sim, avg_sim)``: the highest similarity and the mean
        similarity across all anchors.

    Raises:
        ValueError: If ``anchor_embeddings`` contains no embeddings, or if
            ``get_embedding`` rejects ``post_text``.
    """
    # Validate before embedding the post: with no anchors there is nothing to
    # compare against, and the embedding request would be wasted.
    anchors = list(anchor_embeddings)
    if not anchors:
        raise ValueError("anchor_embeddings must contain at least one embedding.")

    post_embedding = get_embedding(post_text)

    # Compute cosine similarity for each anchor (cosine() itself is a distance).
    similarities = [1 - cosine(post_embedding, anchor_emb) for anchor_emb in anchors]

    max_sim = max(similarities)
    avg_sim = np.mean(similarities)
    return max_sim, avg_sim


In [95]:
# Define multiple anchors for "event-like" text.
# Each sentence is embedded and posts are compared against those embeddings, so
# the exact wording of every string affects the resulting similarity scores.
# NOTE(review): "a event" in the sixth anchor is a typo; it is left as-is here
# because editing the string would change its embedding.
EVENT_ANCHORS = [
    "This post is about an event (conference, seminar, festival, or workshop).",
    "Join us for an upcoming event, meetup, or conference on a specific date or time.",
    "We are hosting a hackathon, convention, or festival and inviting people to attend.",
    "Save the date for our upcoming conference, summit, or gathering.",
    "Don’t miss our upcoming event like a seminar, fair, or workshop with start and end dates.",
    "This post announces or invites people to a event with a name.",
    "This text talks about an event, with words like registration, tickets, sign up, or schedule.",
    "This post contains references to meetups, hackathons, seminars, or workshops (often with dates, locations, or RSVP links).",
]


In [96]:
# Embed each event anchor a single time up front, in the same order as
# EVENT_ANCHORS, so the vectors can be reused for every post that is scored.
EVENT_ANCHORS_EMBEDDINGS = list(map(get_embedding, EVENT_ANCHORS))


In [106]:
post = "AGI is not happening"

# Score the post against the event anchors and report both statistics.
max_similarity, avg_similarity = compute_max_and_average_similarity(post, EVENT_ANCHORS_EMBEDDINGS)

print(
    f"Post: {post}",
    f"Max similarity with event anchors: {max_similarity}",
    f"Average similarity with event anchors: {avg_similarity}",
    sep="\n",
)

Post: AGI is not happening
Max similarity with event anchors: 0.17634680904730404
Average similarity with event anchors: 0.1437441768450703
