In [5]:
# Environment setup and third-party clients used throughout the notebook.
from dotenv import load_dotenv
# Called before the remaining imports; presumably loads PINECONE_API_KEY (read
# via os.getenv when the Pinecone client is built) and the OpenAI key from a
# local .env file — TODO confirm the .env contents.
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [3]:
# Connect to Pinecone using the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists is rejected by Pinecone, which would
# break this cell on every re-run (e.g. Restart Kernel -> Run All). Only create
# the "rag" index when it is not there yet.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # length of every vector this index will accept
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Read the review dataset. The `with` block closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection, and the
# explicit encoding keeps the result independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed reviews as the cell output.
data['reviews']

[{'professor': 'Dr. John Smith',
  'review': 'Incredibly knowledgeable and approachable.',
  'subject': 'Physics',
  'stars': 5},
 {'professor': 'Dr. Emily Johnson',
  'review': 'Engaging lectures but unclear on assignments.',
  'subject': 'Literature',
  'stars': 3},
 {'professor': 'Dr. Michael Brown',
  'review': 'Passionate about the subject; very helpful.',
  'subject': 'History',
  'stars': 4},
 {'professor': 'Dr. Sarah Davis',
  'review': 'Great feedback on papers but strict on deadlines.',
  'subject': 'Mathematics',
  'stars': 4},
 {'professor': 'Dr. David Wilson',
  'review': 'Fantastic at simplifying complex concepts.',
  'subject': 'Chemistry',
  'stars': 5},
 {'professor': 'Dr. Laura Miller',
  'review': 'Too much coursework and not enough support.',
  'subject': 'Biology',
  'stars': 2},
 {'professor': 'Dr. James Garcia',
  'review': 'Interesting discussions, but often goes off-topic.',
  'subject': 'Political Science',
  'stars': 3},
 {'professor': 'Dr. Patricia Martinez'

In [6]:
# Embed every review text and turn it into a record ready for Pinecone upsert:
#   {"values": <embedding>, "id": <professor name>, "metadata": {...}}
# Re-running the cell rebuilds `processed_data` from scratch.
processed_data = []
client = OpenAI()

reviews = data["reviews"]
# Number of review texts sent per embeddings request. Batching replaces one
# network round trip per review with one per chunk; chunking keeps each request
# bounded if the dataset grows. An empty `reviews` list makes no request at all.
EMBEDDING_BATCH_SIZE = 100

for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item reports the position of its input text in `index`;
    # sort on it so embeddings are paired with the right review regardless of
    # the order the items come back in.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        processed_data.append(
            {
                "values": item.embedding,
                # The professor name is used as the vector id, so two reviews
                # for the same professor would share one id.
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )

In [7]:
# Sanity check: show the first record built for upsert (values, id, metadata).
first_record = processed_data[0]
first_record

{'values': [-0.018154749646782875,
  -0.03578454256057739,
  -0.0799683928489685,
  0.029382988810539246,
  0.02691860869526863,
  0.011979218572378159,
  -0.02480420097708702,
  0.04494212195277214,
  -0.022879360243678093,
  -0.00543913384899497,
  0.04549624025821686,
  -0.001634109765291214,
  0.02672904171049595,
  -0.008749277330935001,
  -0.008924262598156929,
  0.014844606630504131,
  -0.034297168254852295,
  0.012358354404568672,
  0.058007705956697464,
  0.037855204194784164,
  0.06509461998939514,
  0.018635960295796394,
  0.004859494511038065,
  0.011045962572097778,
  0.005227692890912294,
  -0.009325271472334862,
  0.007188989315181971,
  -0.005461007356643677,
  0.01512166764587164,
  0.020458726212382317,
  0.09332562237977982,
  -0.03164321929216385,
  -0.02273353934288025,
  -0.020939936861395836,
  -0.04126742482185364,
  0.0538080558180809,
  0.014487344771623611,
  0.0056214104406535625,
  0.002690402790904045,
  0.009266942739486694,
  0.0434839092195034,
  0.0198

In [8]:
# Open a handle to the "rag" index and write all prepared records into the
# "ns1" namespace; the response is shown as the cell output.
index = pc.Index("rag")
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upsert_response

{'upserted_count': 21}

In [11]:
# Fetch the index statistics and display them to confirm the upsert landed.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 21}},
 'total_vector_count': 21}