In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the MiniLM sentence-embedding checkpoint from the local
# 'all-MiniLM-L6-v2/' directory.
# device=None lets sentence-transformers choose the device itself: a GPU is
# still used when one is available, but the notebook no longer fails on
# machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2/', device=None)

In [None]:
def cosine_similarity(model, sentence1, sentence2):
    """Score how similar two sentences are under ``model``.

    Both sentences are embedded together in one ``model.encode`` call and the
    two resulting vectors are handed to ``util.cos_sim``.

    Args:
        model: Object exposing ``encode(list_of_sentences)``.
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        Whatever ``util.cos_sim`` returns for the two embeddings.
    """
    pair = model.encode([sentence1, sentence2])
    first_vec = pair[0]
    second_vec = pair[1]
    return util.cos_sim(first_vec, second_vec)

In [None]:
# Spot-check the similarity score on a few hand-picked name pairs.
name_pairs = [
    ("Koyomi", "Araragi"),
    ("Karen", "Araragi"),
    ("Karen", "Koyomi"),
    ("Mayoi", "Koi"),
]
for left, right in name_pairs:
    print(cosine_similarity(model, left, right))

In [None]:
import pandas as pd
import requests

# Placeholder frame with a single 'anchor' column and no rows; `df` is rebuilt
# from the API response in a later cell.
df = pd.DataFrame(columns=['anchor'])

In [None]:
# Display the current frame (at this point: the 'anchor' column with no rows).
df

In [None]:
# Ask the local data API for every entry that matches one of these filters.
response = requests.post(
    'http://localhost:5173/api/data/filter',
    json={
        "filters": [
            'Characters',
            'Novels',
            'Oddities',
            'Oddity Characters',
            'Oddity Inflicted Characters',
            'Oddity Specialists',
            'Oddity Type',
            'Terminology',
            'Vampire Hunter',
            'Vampires',
        ]
    },
    # Without a timeout, requests waits indefinitely if the server never answers.
    timeout=30,
)
# Fail here with a clear HTTP error instead of trying to parse an error page.
response.raise_for_status()
response.json()

In [None]:
# One record per returned entry, keeping only its "id" under the 'anchor' key.
items = [{'anchor': entry.get("id")} for entry in response.json()]

In [None]:
# Build the working frame from the fetched records. The records only carry
# 'anchor', so the requested 'embedding' column starts out empty here.
df = pd.DataFrame(items, columns=['anchor', 'embedding'])

In [None]:
# Embed every anchor in one batched model call instead of one call per row.
# list(...) splits the 2-D result into one vector per row, so each cell of the
# 'embedding' column still holds a single 1-D array.
df['embedding'] = list(model.encode(df['anchor'].tolist()))

In [None]:
# Inspect the frame now that the 'embedding' column has been assigned.
df

In [None]:
# use postgres to store the embeddings
import os

import psycopg2
from psycopg2 import sql

# Take the connection string from the MONOGATARI_DSN environment variable so
# credentials do not have to live in the notebook. The literal fallback keeps
# the previous behaviour when the variable is not set.
# NOTE(review): remove the fallback (or move it to untracked config) before
# sharing this notebook, since it embeds a password.
conn = psycopg2.connect(
    os.environ.get("MONOGATARI_DSN", "dbname=monogatari user=alex password=password")
)

## table schema:
#public.embeddings
#(
#id character varying COLLATE pg_catalog."default" NOT NULL,
#vec vector(384) NOT NULL,
#CONSTRAINT embeddings_pkey PRIMARY KEY (id)
#)

print(conn)

In [None]:
# Store one (id, vec) row per anchor in the embeddings table.
#
# ON CONFLICT makes the cell safe to re-run: an id that is already present has
# its vector replaced instead of raising a primary-key violation.
upsert_embedding = sql.SQL(
    "INSERT INTO embeddings (id, vec) VALUES (%s, %s) "
    "ON CONFLICT (id) DO UPDATE SET vec = EXCLUDED.vec"
)

# `with conn` commits when the block finishes and rolls back if any statement
# raises, so a failed insert does not leave the connection inside an aborted
# transaction. `with conn.cursor()` closes the cursor afterwards.
with conn, conn.cursor() as cur:
    for _, row in df.iterrows():
        # The embedding is converted to a plain Python list before it is
        # passed as a query parameter.
        cur.execute(upsert_embedding, [row['anchor'], row['embedding'].tolist()])

In [None]:
# Release the database connection; `conn` cannot run further queries after this.
conn.close()

In [None]:
# For every anchor, print the five anchors most similar to it.
#
# The vectors already stored in df['embedding'] are reused, so the model is not
# run again here (previously each of the rows x rows pairs was re-encoded).
anchors = df['anchor'].tolist()
# Stack the per-row vectors into one (rows x dims) array using pandas only.
vectors = pd.DataFrame(df['embedding'].tolist()).to_numpy()
# A single call produces the full pairwise matrix; row i holds the scores of
# anchor i against every anchor (including itself).
pairwise = util.cos_sim(vectors, vectors).tolist() if anchors else []

for anchor, scores in zip(anchors, pairwise):
    print(anchor)
    similar = pd.DataFrame({'anchor': anchors, 'similarity': scores})
    similar = similar.sort_values('similarity', ascending=False).head(5)
    print(similar)
    print()
    

In [None]:
# (Re)create `model` from the local 'all-MiniLM-L6-v2/' checkpoint directory.
# device=None lets sentence-transformers choose the device itself: a GPU is
# still used when one is available, but this cell no longer fails on machines
# without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2/', device=None)

In [None]:
import pandas as pd

# Load tvtropes.json (path is relative to the working directory) into a
# DataFrame; a later cell reads its 'label' and 'text' fields.
spoilers = pd.read_json("tvtropes.json")
# Bare expression: renders the loaded frame as the cell output.
spoilers

In [None]:
# Pick one random row whose label is False and keep only its "text" value.
label_false_rows = spoilers.query('label == False')
picked_records = label_false_rows.sample(1).to_dict(orient='records')
spoiler = picked_records[0].get("text")
spoiler

In [None]:
import re

# Remove every character that is neither a word character nor whitespace.
spoiler = re.sub(r'[^\w\s]', '', spoiler)

# Cut the cleaned text into consecutive runs of at most five words. The
# pattern may also match the empty string, so empty hits are skipped before
# the surrounding whitespace is trimmed.
matches = []
for chunk in re.findall(r'(?:\w+\s*){,5}', spoiler):
    if chunk != '':
        matches.append(chunk.strip())
matches

In [None]:
def get_similar_records(text):
    """Return the five anchors in ``df`` that are most similar to ``text``.

    ``text`` and every value of ``df['anchor']`` are embedded in a single
    batched ``model.encode`` call (rather than one call per row) and the
    anchors are then ranked by cosine similarity to ``text``.

    Args:
        text: Query string to compare against the anchors.

    Returns:
        dict: Maps the positional row number of each of the top five anchors
        to its text, ordered from most to least similar. Empty when ``df``
        has no rows.
    """
    anchors = df['anchor'].tolist()
    if not anchors:
        # Nothing to rank; mirrors the empty result of sorting an empty frame.
        return {}

    # Row 0 is the query embedding, the remaining rows line up with `anchors`.
    embeddings = model.encode([text] + anchors)
    # cos_sim of one vector against a matrix yields a 1 x len(anchors) result.
    scores = util.cos_sim(embeddings[0], embeddings[1:])[0].tolist()

    ranked = pd.DataFrame({'text': anchors, 'similarity': scores})
    ranked = ranked.sort_values('similarity', ascending=False)
    return ranked['text'].head(5).to_dict()

In [None]:
# Print, for each text chunk, the list of its closest anchors.
for match in matches:
    closest = get_similar_records(match)
    similar = [*closest.values()]
    print(similar)

In [None]:
# Same lookup as the previous cell, but with the query chunk printed first.
for match in matches:
    print(f'finding matches for:\t{match}')
    # Materialise the values as a list so the output shows the anchors
    # themselves rather than a `dict_values(...)` view, matching the cell above.
    similar = list(get_similar_records(match).values())
    print(similar)