In [6]:
import json

# Parse the credentials file once; the handle is closed as soon as the JSON is loaded.
with open('credentials.json') as f:
    data = json.load(f)

# `api_key` is the key handed to the Pinecone client; `openai_api_key` goes to OpenAI.
api_key = data['api_key']
openai_api_key = data['openai_api_key']

In [7]:
from pinecone import Pinecone

# Pinecone client, authenticated with the `api_key` value read from credentials.json.
pc = Pinecone(api_key=api_key)


In [8]:

# Build `emoji` as a list of [emoji, "E<version>", name] triples, one per
# "fully-qualified" data line of emoji.txt (presumably Unicode's emoji-test.txt,
# whose data lines look like
#   "<code points> ; fully-qualified # <emoji> E<version> <name>").
# The file is full of non-ASCII characters, so the encoding is pinned to UTF-8
# rather than relying on the platform default.
with open("emoji.txt", encoding="utf-8") as f:
    emoji = [
        # Split on the FIRST "#" only: the keycap "#" entry contains "#" in both
        # its emoji and its name, and an unlimited split would truncate that row
        # to a single empty field.
        line.split("#", 1)[1].strip().split(" ", 2)
        for line in f
        # Skip comment/header lines and any non fully-qualified variants.
        if "fully-qualified" in line and not line.startswith("#")
    ]

print(len(emoji))

1898


In [9]:
# Handle to the Pinecone index named "emoji"; the upsert and query cells below go through it.
index = pc.Index("emoji")

In [10]:
from openai import AsyncOpenAI

# Asynchronous OpenAI client, authenticated with the key loaded from credentials.json.
async_client = AsyncOpenAI(api_key=openai_api_key)

In [11]:
import asyncio
from asynciolimiter import Limiter

limiter = Limiter(500/60)

async def analyze(e):
	await limiter.wait()
	chat = await async_client.chat.completions.create(messages=[
		{"role": "system", "content": "You are a helpful assistant that analyzes emoji. You will receive a message with an emoji and it's canonical name. You will respond with a description of the emoji and a short 1-2 sentence description of how it is used."},
		{"role": "user", "content": f"emoji: {e[0]}\name: {e[2]}"},
	], model="gpt-3.5-turbo", temperature=0.3)
	resp = chat.choices[0].message.content
	embedding = await async_client.embeddings.create(input=resp, model="text-embedding-3-small")

	return {
		"id": e[2],
		"values": embedding.data[0].embedding,
		"metadata": { "edition": float(e[1].removeprefix("E")), "name": e[2], "emoji": e[0], "description":  resp }
	}

tasks = [analyze(emo) for emo in emoji]
completed_tasks = await asyncio.gather(*tasks, return_exceptions=True)


In [12]:
# Sanity check: gather() yields one entry per submitted task (a result dict or an exception).
print(len(completed_tasks))

5


In [13]:
# The gather call used return_exceptions=True, so failed tasks show up here as
# exception objects. Indexing them with ["id"] would raise TypeError, and they
# must not be sent to the index, so report them and keep only the result dicts.
failed_tasks = [v for v in completed_tasks if isinstance(v, BaseException)]
if failed_tasks:
    print(f"Skipping {len(failed_tasks)} failed task(s); first error: {failed_tasks[0]!r}")
completed_tasks = [v for v in completed_tasks if not isinstance(v, BaseException)]

# Non-ASCII characters that occur in emoji names, mapped to ASCII stand-ins
# (presumably because the vector index only accepts ASCII ids -- confirm).
ID_REPLACEMENTS = {
    "’": "-",
    "ñ": "n",
    "“": "\"",
    "”": "\"",
    "Å": "A",
    "é": "e",
    "ô": "o",
    "ç": "c",
    "ã": "a",
    "í": "i",
    "ü": "u",
}

for v in completed_tasks:
    # Only the id is rewritten; metadata["name"] keeps the original spelling.
    for original, replacement in ID_REPLACEMENTS.items():
        v["id"] = v["id"].replace(original, replacement)

# Show one record as a spot check (guarded so an all-failed run does not raise).
if completed_tasks:
    print(completed_tasks[0])

{'id': 'grinning face', 'values': [0.029121619, -0.0015660091, -0.011394021, 0.072065845, 0.027987566, -0.009216853, 0.047630217, 0.022060536, 0.023836505, -0.043714523, 0.0028699024, 0.0035091443, 0.018358815, -0.006499406, 0.0027054113, 0.06894185, -0.036910206, -0.005445593, 0.022188919, 0.015801849, 0.04728786, 0.024564011, -0.022360098, 0.07707279, 0.025676666, 0.009885517, -0.05075421, 0.0009755796, 0.014229152, 0.0075264727, -0.08824214, -0.021226045, 0.003961161, -0.049341995, 0.008772861, 0.015491589, -0.07095319, 0.020776702, 4.080935e-05, 0.0018628957, -0.01794157, -0.057943676, -0.017321052, -0.027153075, -0.031239944, -0.031282738, 0.010104838, 0.013127196, 0.054434534, 0.03286613, -0.03034126, 0.028693674, 0.030448247, 0.034149967, -0.04059053, -0.045233727, 0.020006403, 0.01263506, -0.019054227, 0.019514266, 0.031411123, 0.030640822, -0.0039023184, 0.040783104, 0.027367046, -0.02648976, -0.037680507, 0.020979976, -0.03335827, 0.006290783, 0.049812734, 0.048015367, 0.0135

In [31]:
# Upload every record in `completed_tasks` to the "emoji" index, sent in batches of 100.
index.upsert(vectors=completed_tasks, batch_size=100)

Upserted vectors: 100%|██████████| 1897/1897 [00:17<00:00, 109.13it/s]


{'upserted_count': 1897}

In [15]:
from openai import OpenAI

# Free-text search phrase to look up in the emoji index.
query = "Class pet"

# Synchronous client: this cell makes a single blocking embedding call.
client = OpenAI(api_key=openai_api_key)

# Embed the query with the same model used for the stored descriptions so the
# vectors are comparable.
embedding_response = client.embeddings.create(input=[query], model="text-embedding-3-small")
v = embedding_response.data[0].embedding

# Ten nearest neighbours; metadata is needed for display, raw vectors are not.
results = index.query(vector=v, top_k=10, include_metadata=True, include_values=False)

matches = results["matches"]
for result in matches:
    metadata = result["metadata"]
    print(metadata["emoji"], result["score"])

# Full record of the best match, for inspection.
print(matches[0])

🐈 0.387731433
🐕 0.346858114
🐩 0.346812248
🐾 0.325074822
🐱 0.324121416
🐹 0.301057756
🐶 0.295934945
🐕‍🦺 0.295025021
🐈‍⬛ 0.290965855
😺 0.28406617
{'id': 'cat',
 'metadata': {'description': 'The emoji 🐈 depicts a cute and playful cat. It '
                             'is often used to represent pet cats, love for '
                             'cats, or to convey feelings of playfulness or '
                             'mischief.',
              'edition': 0.7,
              'emoji': '🐈',
              'name': 'cat'},
 'score': 0.387731433,
 'values': []}
