In [20]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [12]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag2", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [13]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Dr. Johnson explains complex concepts clearly and makes physics interesting. Her problem-solving sessions are particularly helpful.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Literature',
  'stars': 5,
  'review': "Prof. Chen's passion for literature is contagious. His classes are always engaging and thought-provoking. Highly recommended!"},
 {'professor': 'Dr. Sarah Rodriguez',
  'subject': 'Computer Science',
  'stars': 3,
  'review': 'Dr. Rodriguez knows her stuff, but her teaching style can be a bit dry. The assignments are challenging but fair.'},
 {'professor': 'Prof. David Lee',
  'subject': 'Mathematics',
  'stars': 2,
  'review': "Prof. Lee's lectures are hard to follow. He often seems unprepared and struggles to answer students' questions clearly."},
 {'professor': 'Dr. Amanda Taylor',
  'subject': 'Psychology',
  'stars': 5,
  'review': 'Dr. Taylor is an outstanding professor! S

In [21]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [22]:
processed_data[0]

{'values': [-0.0003686227,
  -0.007916346,
  0.01361783,
  0.029922074,
  0.027364265,
  0.0047869603,
  -0.008509357,
  0.042468198,
  0.0067410404,
  0.016818663,
  0.04143936,
  0.0013512445,
  -0.04792676,
  -0.008373608,
  0.019019235,
  0.047955338,
  -0.03832426,
  0.0025095816,
  0.028793208,
  0.060587198,
  0.019762287,
  -0.018847764,
  0.028278789,
  -0.017790344,
  -0.03626658,
  -0.03609511,
  0.048869863,
  0.006658876,
  0.035466373,
  -0.011274363,
  0.10231234,
  -0.005383544,
  -0.017304504,
  -0.024935061,
  -0.016075613,
  0.034694742,
  -0.0023988385,
  0.016904399,
  0.009488184,
  0.019376472,
  -0.0032812112,
  0.0011252929,
  -0.05852952,
  0.009852564,
  0.030750861,
  0.008259293,
  -0.021719938,
  -0.0153611405,
  0.045126032,
  0.028636025,
  -0.0052656564,
  0.028278789,
  0.01763316,
  -0.033437274,
  -0.03655237,
  0.029064707,
  0.031465333,
  0.026335426,
  0.0050798934,
  -0.027164213,
  0.028364526,
  -0.019204998,
  -0.009695381,
  0.028536,
  -0.0

In [24]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 7}

In [25]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 114},
                'ns1': {'vector_count': 7},
                'youtube-videos': {'vector_count': 52}},
 'total_vector_count': 173}