In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [3]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [4]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Thompson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Dr. Thompson explains complex concepts clearly and is always willing to help outside of class.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Brilliant instructor! Prof. Chen's passion for coding is contagious. Challenging but rewarding classes."},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Dr. Johnson knows her stuff, but the pace of the class can be overwhelming at times.'},
 {'professor': 'Prof. David Martinez',
  'subject': 'History',
  'stars': 4,
  'review': 'Engaging lectures and thought-provoking discussions. Prof. Martinez brings history to life.'},
 {'professor': 'Dr. Rachel Kim',
  'subject': 'Psychology',
  'stars': 5,
  'review': 'Dr. Kim is an exceptional educator. Her research-based approach and real-world examples are invaluable.'},
 {'professor': 'Prof. James Wilson',
  'subject': 'Mathe

In [8]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'], 
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding, 
        "id": review ["professor"], 
        "metadata": {
            "review": review["review"], 
            "subject": review["subject"], 
            "stars": review["stars"]
        }
    })

In [9]:
processed_data[0]

{'values': [-0.02644188,
  -0.0022904405,
  -0.021231592,
  0.046529092,
  0.009767609,
  0.010063801,
  0.013517132,
  0.016492518,
  -0.008273185,
  0.001427949,
  0.040416766,
  0.024260828,
  0.0006007988,
  -0.038989656,
  0.0071288063,
  0.028272886,
  0.019562144,
  0.00034331362,
  0.022752943,
  0.07334795,
  0.010413847,
  0.015550087,
  0.022820259,
  -0.033281226,
  -0.057138156,
  -0.059023015,
  0.012487192,
  0.033065815,
  -0.017394558,
  -0.013416158,
  0.10124386,
  -0.033604346,
  0.01752919,
  -0.024610875,
  -0.029323023,
  0.026428416,
  -0.0071086115,
  0.020464184,
  0.007189391,
  0.005977696,
  0.004203909,
  0.02792284,
  -0.034789115,
  0.007525973,
  0.008831911,
  -0.040389836,
  -0.041816946,
  -0.0057622837,
  0.024220439,
  0.020127602,
  -0.020194918,
  0.029511508,
  0.023628054,
  0.009585855,
  -0.0838493,
  0.039070435,
  0.0067013474,
  0.028542152,
  0.01564433,
  -0.011686127,
  0.045936707,
  0.0086703515,
  -0.0024217074,
  -0.00031175907,
  -

In [11]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data, 
    namespace="ns1"
)

{'upserted_count': 20}

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}