In [17]:
import pandas as pd
from openai import OpenAI
import tiktoken
import chromadb 
from collections import defaultdict

In [2]:
# Load the two "f_*" tables. index_col=1 makes the second CSV column the row index
# (presumably the same movie identifier the review tables are indexed by — TODO confirm).
f_audience = pd.read_csv('dataset/f_audience.csv', index_col=1)
f_critics = pd.read_csv('dataset/f_critics.csv', index_col=1)

In [3]:
# Load the cleaned review tables, indexed by their first CSV column (index_col=0).
# Later cells read the review text from the `review_content` column and expect the
# index labels to overlap with the index of f_audience / f_critics.
r_audience = pd.read_csv('dataset/audience_reviews_clean_v2.csv', index_col=0)
r_critics = pd.read_csv('dataset/critics_reviews_clean_v2.csv', index_col=0)

In [4]:
def _reviews_for_films(reviews, films):
    """Return the rows of ``reviews`` whose index label also occurs in ``films``' index.

    The shared labels are computed as ``films.index.intersection(reviews.index)``
    and then used to select from ``reviews`` with ``.loc``.
    """
    shared_labels = films.index.intersection(reviews.index)
    return reviews.loc[shared_labels]


# Naming: r{a|c} = audience / critics review table, f{a|c} = f_audience / f_critics table.
ra_fa = _reviews_for_films(r_audience, f_audience)
ra_fc = _reviews_for_films(r_audience, f_critics)
rc_fa = _reviews_for_films(r_critics, f_audience)
rc_fc = _reviews_for_films(r_critics, f_critics)

In [5]:
# OpenAI client built with no explicit arguments
# (presumably picks up credentials from the environment — confirm).
oai = OpenAI()


def get_embeddings(texts):
    """Embed ``texts`` with the ``text-embedding-3-small`` model.

    ``texts`` is passed unchanged as the ``input`` of one embeddings request;
    the ``data`` attribute of the response is returned.
    """
    return oai.embeddings.create(
        model="text-embedding-3-small",
        input=texts,
    ).data

In [6]:
# Tokenizer that tiktoken maps to the embedding model, so counts are in that model's tokens.
encoding = tiktoken.encoding_for_model("text-embedding-3-small")


def get_token_count(s):
    """Return the number of tokens ``encoding`` produces for the string ``s``."""
    tokens = encoding.encode(s)
    return len(tokens)
def get_corpus_token_count(corpus):
    """Return the combined token count of every text in ``corpus`` (0 if it is empty)."""
    return sum(get_token_count(text) for text in corpus)

In [7]:
# Combined token count of the review text in both review tables.
sum(
    get_corpus_token_count(reviews.review_content.values)
    for reviews in (r_audience, r_critics)
)

1939126

In [31]:
# Chroma store persisted on disk under the relative path 'embeddings'.
client = chromadb.PersistentClient('embeddings')
# Embedding function attached to every collection below, so documents and query
# texts handed to those collections are embedded with this OpenAI model.
openai_ef = chromadb.utils.embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )


def _open_collection(name):
    """Return the collection called ``name``, creating it if it does not exist yet.

    ``create_collection`` raises once the collection is already present in the
    persistent store, which made this cell fail on every run after the first.
    ``get_or_create_collection`` keeps the cell re-runnable on a fresh kernel.
    All collections use cosine distance for their HNSW index.
    """
    return client.get_or_create_collection(
        name,
        embedding_function=openai_ef,
        metadata={"hnsw:space": "cosine"},
    )


# One collection per review subset; c_<subset> is filled from the DataFrame <subset>.
c_ra_fa = _open_collection('ra_fa')
c_ra_fc = _open_collection('ra_fc')
c_rc_fa = _open_collection('rc_fa')
c_rc_fc = _open_collection('rc_fc')

In [63]:
# Default number of rows sent to the collection per `add` call.
batch_size = 2000


def load_corpus_into_db(corpus, collection, chunk_size=None, skip_existing=True):
    """Add every row of ``corpus`` to ``collection`` in batches.

    Each row becomes one record whose document is the row's ``review_content``
    and whose id is ``"<index label>/<n>"``, where ``n`` counts (from 0) how many
    earlier rows of ``corpus`` carried the same index label. Ids therefore depend
    only on the row order of ``corpus``, so repeated runs produce the same ids.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must expose a ``review_content`` column; index labels may repeat.
    collection
        Object providing ``add(ids=..., documents=...)`` and, when
        ``skip_existing`` is true, ``get(ids=..., include=[])`` returning a
        mapping with an ``"ids"`` list (the Chroma collection interface).
    chunk_size : int, optional
        Rows per ``add`` call. Defaults to the module-level ``batch_size``.
    skip_existing : bool, default True
        When true, ids already stored in ``collection`` are filtered out before
        ``add`` is called, so re-running the load (or resuming an interrupted
        one) does not submit the same documents again.

    Raises
    ------
    ValueError
        If the effective batch size is not a positive integer.
    """
    step = batch_size if chunk_size is None else chunk_size
    if step <= 0:
        # A negative step would make range() empty and silently load nothing.
        raise ValueError(f"batch size must be positive, got {step}")

    # Occurrences of each index label seen so far; kept across batches so the
    # numeric suffix continues where the previous batch stopped.
    seen = defaultdict(int)
    for start in range(0, len(corpus), step):
        batch = corpus.iloc[start:start + step]

        new_ids = []
        for mid in batch.index.values:
            new_ids.append(f"{mid}/{seen[mid]}")
            seen[mid] += 1
        documents = batch.review_content.tolist()

        if skip_existing:
            # include=[] asks only for the ids, not documents or metadata.
            present = set(collection.get(ids=new_ids, include=[])["ids"])
            if present:
                pending = [
                    (rid, doc)
                    for rid, doc in zip(new_ids, documents)
                    if rid not in present
                ]
                new_ids = [rid for rid, _ in pending]
                documents = [doc for _, doc in pending]

        if not new_ids:
            continue
        collection.add(ids=new_ids, documents=documents)

In [64]:
# Fill each collection from the review subset of the same name (same order as before).
for reviews, target in (
    (ra_fa, c_ra_fa),
    (ra_fc, c_ra_fc),
    (rc_fc, c_rc_fc),
    (rc_fa, c_rc_fa),
):
    load_corpus_into_db(reviews, target)

In [80]:
# Sanity check: ask the collection loaded from ra_fc for the stored reviews closest to
# the query text 'horror'. No n_results is passed; the output below lists 10 hits with
# their ids, distances and documents.
c_ra_fc.query(query_texts=['horror'])

{'ids': [['m/little_evil/19',
   'm/bedlam/6',
   'm/patrick_evil_awakens/0',
   'm/the_dead_center/1',
   'm/dig_two_graves_2017/19',
   'm/10008613-burrowers/0',
   'm/the_funhouse_massacre/1',
   'm/antibirth/18',
   'm/arachnophobia/5',
   'm/we_are_what_we_are_2013/28']],
 'distances': [[0.4080613851547241,
   0.4525904059410095,
   0.4588698744773865,
   0.46502822637557983,
   0.4681416153907776,
   0.47131937742233276,
   0.47191721200942993,
   0.47357577085494995,
   0.4797324538230896,
   0.4826468825340271]],
 'metadatas': [[None, None, None, None, None, None, None, None, None, None]],
 'embeddings': None,
 'documents': [['great horror satire.',
   'not really a horror movie.',
   'that so called horror was laughable.',
   'psychological horror at its best!',
   'a pretty meh, "horror" movie.',
   'good western horror & period piece.',
   "fun horror comedy that doesn't take itself too seriously.",
   'budget gross-out horror.',
   'an enjoyable creature horror film',
   'a