In [7]:
import json

with open('credentials.json') as f:
    data = json.load(f)
    api_key = data['api_key']
    openai_api_key = data['openai_api_key']

In [8]:
from pinecone import Pinecone

pc = Pinecone(api_key=api_key)
index = pc.Index("emoji")


In [9]:

with open("emoji.txt") as f:
    emoji = [line.split("#")[1].strip().split(" ", 2) for line in f if "fully-qualified" in line and not line.startswith("#")]

emoji = [e for e in emoji if len(e) == 3]

print(len(emoji))

1897


In [10]:
from openai import AsyncOpenAI

async_client = AsyncOpenAI(
    api_key=openai_api_key,
)

In [11]:
def remove_special_characters(value: str):
    return value.replace("’", "-").replace("ñ", "n").replace("“", "\"").replace("”", "\"").replace("Å", "A").replace("é", "e").replace("ô", "o").replace("ç", "c").replace("ã", "a").replace("í", "i").replace("ü", "u")

In [12]:
import asyncio
from asynciolimiter import Limiter

limiter = Limiter(500/60)

async def analyze(e):
	await limiter.wait()
	chat = await async_client.chat.completions.create(messages=[
		{"role": "system", "content": "You are a helpful assistant that analyzes emoji. You will receive a message with an emoji and it's canonical name. You will respond with a description of the emoji and a short 1-2 sentence description of how it is used."},
		{"role": "user", "content": f"emoji: {e[0]}\name: {e[2]}"},
	], model="gpt-3.5-turbo", temperature=0.3)
	resp = chat.choices[0].message.content
	embedding = await async_client.embeddings.create(input=resp, model="text-embedding-3-small")

	return {
		"id": remove_special_characters(e[2]),
		"values": embedding.data[0].embedding,
		"metadata": { "edition": float(e[1].removeprefix("E")), "name": e[2], "emoji": e[0], "description":  resp }
	}

tasks = [analyze(emo) for emo in emoji]
completed_tasks = await asyncio.gather(*tasks, return_exceptions=True)


In [14]:
index.upsert(vectors=completed_tasks, batch_size=100, show_progress=False)


In [20]:
query = "relaxation"
from openai import OpenAI
client = OpenAI(
    api_key=openai_api_key
)
v = client.embeddings.create(input=[query], model="text-embedding-3-small").data[0].embedding

results = index.query(vector=v, top_k=10, include_metadata=True, include_values=False)

for result in results["matches"]:
    print(result["metadata"]["emoji"], result["score"])
    
print(results["matches"][0])

💆‍♂️ 0.40551883
🧖‍♀️ 0.39030984
🧖 0.35707128
🧖‍♂️ 0.345063329
💆 0.337619901
😌 0.332396418
🧘‍♂️ 0.323039919
🧘‍♀️ 0.322773635
💆‍♀️ 0.31727761
🧘 0.309738398
{'id': 'man getting massage',
 'metadata': {'description': 'This emoji depicts a man getting a massage, with '
                             'his eyes closed and a serene expression. It is '
                             'often used to represent relaxation, self-care, '
                             'spa days, or getting pampered.',
              'edition': 4.0,
              'emoji': '💆\u200d♂️',
              'name': 'man getting massage'},
 'score': 0.40551883,
 'values': []}
