In [39]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [41]:
# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable rather than being hard-coded in the notebook.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists raises an error, so only create it when
# it is missing. This keeps the cell safe under Restart Kernel -> Run All.
# The index uses 1536 dimensions with cosine similarity on AWS us-east-1
# serverless; vectors upserted into it must have the same dimension.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [42]:
import json

# Load the professor reviews. The context manager closes the file handle
# deterministically (a bare open() inside json.load() leaves it to the garbage
# collector), and the explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Sarah Johnson',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Great lectures and always available for questions. Could be clearer on assignments.'},
 {'professor': 'Prof. Michael Lee',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Amazing professor! Makes difficult concepts easy to understand.'},
 {'professor': 'Dr. Emily Turner',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Good lectures, but not very responsive outside of class.'},
 {'professor': 'Prof. David Thompson',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Expects too much without clear guidance. Tough exams.'},
 {'professor': 'Dr. Jessica Collins',
  'subject': 'History',
  'stars': 5,
  'review': 'Engaging and passionate about the subject. Highly recommended!'},
 {'professor': 'Prof. Robert Garcia',
  'subject': 'Philosophy',
  'stars': 4,
  'review': 'Thought-provoking lectures, but grading is a bit strict.'},
 {'professor': 'Dr. Amanda Brown',
  'subject': 'Economic

In [43]:
# Build the upsert payload: one record per review, holding the embedding of the
# review text, the professor name as the vector ID, and the remaining fields as
# metadata.
processed_data = []
client = OpenAI()

reviews = data['reviews']

# The professor name doubles as the vector ID. Upserting a repeated ID would
# overwrite the earlier record, so fail loudly instead of silently losing reviews.
professor_ids = [review["professor"] for review in reviews]
assert len(set(professor_ids)) == len(professor_ids), (
    "duplicate professor names in reviews; vector IDs must be unique"
)

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches instead of paying one network round trip per review. An empty review
# list simply produces no requests.
EMBED_BATCH_SIZE = 100

for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; sort on it so the
    # embeddings are guaranteed to line up with `batch`.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        processed_data.append({
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

In [44]:
# Sanity-check the first record (values / id / metadata) before it is upserted.
# NOTE: this renders the record's entire embedding vector in the cell output.
processed_data[0]

{'values': [-0.02329730987548828,
  0.012166043743491173,
  -0.019158193841576576,
  -0.005931500811129808,
  0.0020529276225715876,
  -0.006426716689020395,
  -0.011116482317447662,
  -0.024686869233846664,
  0.03671986982226372,
  -0.0006638292106799781,
  0.026963382959365845,
  -0.00702910590916872,
  -0.008004754781723022,
  -0.0573858842253685,
  0.00032590917544439435,
  -0.0038619430270045996,
  0.0034406401682645082,
  0.008596057072281837,
  0.03172336518764496,
  0.05785892531275749,
  0.02663816697895527,
  -0.024258175864815712,
  0.004320202395319939,
  0.033526837825775146,
  -0.07893884927034378,
  -0.06693542003631592,
  0.015787770971655846,
  -0.010436484590172768,
  0.007871711626648903,
  0.05910066142678261,
  0.07190235704183578,
  -0.024642521515488625,
  -0.008699534460902214,
  -0.002440969692543149,
  -0.050438083708286285,
  0.06740845739841461,
  0.010916918516159058,
  0.02878163754940033,
  0.013141692616045475,
  0.016393855214118958,
  0.011404742486774

In [45]:
# Handle to the "rag" index; later cells reuse `index`.
index = pc.Index("rag")

# Write every prepared record into the "ns1" namespace. The call is the cell's
# last expression so its response is displayed.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [46]:
# Verify the upsert: show the index dimension and the vector count per namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}