In [8]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [7]:
import json
data=json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Emily Thompson',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Thompson's lectures are engaging and well-structured. She's always willing to help during office hours."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Brilliant instructor! Prof. Chen makes complex algorithms easy to understand. His projects are challenging but rewarding.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Dr. Johnson knows her material well, but her tests can be quite difficult. More practice exercises would be helpful.'},
 {'professor': 'Prof. David Martinez',
  'subject': 'History',
  'stars': 4,
  'review': "Prof. Martinez's passion for history is contagious. His anecdotes make the subject come alive."},
 {'professor': 'Dr. Rachel Lee',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Dr. Lee is an exceptional educator. Her lab demonstrations are always fascinating and safety is

In [15]:
processed_data=[]
client = OpenAI()

for review in data["reviews"]:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [16]:
processed_data[0]

{'values': [0.015962591,
  0.0067897593,
  0.0322447,
  0.047714703,
  0.009106265,
  0.027198978,
  0.02843711,
  0.012194941,
  -0.008360723,
  0.004246928,
  -0.002804104,
  0.015909338,
  0.025028918,
  -0.0388747,
  0.005671446,
  0.037916146,
  0.017706627,
  -0.022086687,
  0.025095483,
  0.057886027,
  0.042203017,
  -0.011915362,
  0.03589253,
  -0.034401447,
  -0.04688928,
  -0.04848687,
  -0.00040293395,
  0.026320303,
  -0.017693315,
  0.00361122,
  0.06214627,
  -0.03141928,
  0.010630634,
  -0.03330976,
  -0.054078437,
  0.043800604,
  -0.006476898,
  0.01795958,
  0.033150002,
  0.0044432986,
  0.042495906,
  0.016761387,
  -0.013466356,
  0.013007049,
  0.00949235,
  -0.028783254,
  -0.030407472,
  -0.007954669,
  0.03820904,
  0.017786507,
  -0.036185425,
  0.027318796,
  0.020262772,
  -0.023923917,
  -0.08067832,
  0.022778977,
  -0.008314127,
  0.019317532,
  0.02111482,
  -0.038954582,
  0.0418835,
  -0.01010476,
  0.0055283285,
  -0.017946266,
  -0.037996028,
  -0

In [18]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [19]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}