In [13]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone,ServerlessSpec


In [2]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws",region="us-east-1"))

In [5]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Emma Thompson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures, but tough grader.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Excellent at explaining complex concepts.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Knowledgeable, but often rushes through material.'},
 {'professor': 'Prof. David Lee',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Clear explanations and helpful office hours.'},
 {'professor': 'Dr. Rachel Martinez',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Passionate about the subject, great lab instructor.'},
 {'professor': 'Prof. James Wilson',
  'subject': 'History',
  'stars': 2,
  'review': 'Lectures are dry, needs more interactive elements.'},
 {'professor': 'Dr. Emily Brown',
  'subject': 'Psychology',
  'stars': 4,
  'review': 'Interesting case studies, fair assessments.'},
 {'professor': 'Prof. Alexandra

In [14]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model = "text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values":embedding,
        "id":review["professor"],
        "metadata":{
            "review":review["review"],
            "subject":review["subject"],
            "stars":review["stars"]

        }
    })

In [15]:
processed_data[0]

{'values': [-0.033003684,
  0.035098724,
  -0.006713651,
  -0.023099858,
  0.009529712,
  -0.0073326402,
  0.0074890885,
  -0.026296835,
  0.020093339,
  0.016243363,
  0.02111365,
  0.013522532,
  -0.056865375,
  -0.00020810109,
  0.025317336,
  -0.00051143125,
  -0.018542465,
  -0.024868399,
  0.0034503543,
  0.03466339,
  0.019127443,
  -0.026568918,
  0.06655154,
  0.011325461,
  -0.018610487,
  -0.06100104,
  0.00805366,
  -0.0112506375,
  0.033520643,
  0.03626868,
  0.0337111,
  -0.025521398,
  0.015372697,
  -0.054552667,
  -0.023290316,
  0.040377136,
  0.016161738,
  0.058280207,
  0.035942182,
  0.020025318,
  0.02320869,
  0.021331318,
  -0.018841757,
  -0.0059790267,
  0.050226547,
  -0.014910156,
  0.011849221,
  -0.043696553,
  0.044784885,
  0.005009731,
  -0.06279679,
  0.056484457,
  0.058334623,
  0.017658195,
  -0.03001077,
  -0.020215778,
  -0.01944034,
  0.049328674,
  -0.010413982,
  0.0040982524,
  0.013155219,
  -0.02605196,
  0.02120888,
  -0.03724818,
  -0.00

In [16]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [17]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}