In [12]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [9]:
# Connect to Pinecone using the key loaded from the environment / .env file.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index fails when an index with this name already exists, so only
# create it when missing. This keeps the cell safe to re-run (Restart & Run All).
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        # Must equal the length of the vectors upserted into this index.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [11]:
import json

# Load the review records from disk. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector,
# and the explicit encoding avoids depending on the platform's locale default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review dicts stored under the "reviews" key.
data['reviews']


[{'professor': 'Dr. Emily Watson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Watson is an amazing professor who genuinely cares about her students. Her lectures are clear, and she is always available for help.'},
 {'professor': 'Prof. John Miller',
  'subject': 'Economics',
  'stars': 4,
  'review': 'Prof. Miller is very knowledgeable and explains concepts well, but his grading can be tough.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Mathematics',
  'stars': 3,
  'review': "Dr. Lee's lectures can be a bit dry, but she provides excellent resources for studying."},
 {'professor': 'Prof. Michael Brown',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Prof. Brown is very passionate about physics, and it shows in his teaching. His exams are challenging but fair.'},
 {'professor': 'Dr. Rebecca Green',
  'subject': 'Chemistry',
  'stars': 2,
  'review': "Dr. Green's lectures are difficult to follow, and she doesn't offer much help outside of class."},
 {'profess

In [13]:
processed_data = []
client = OpenAI()

# Number of reviews sent per embeddings request. Batching replaces one network
# round trip per review with one per batch; an empty review list makes no calls.
EMBED_BATCH_SIZE = 100

# Create embeddings for each review
reviews = data["reviews"]
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item reports the position of its input via `.index`;
    # sort on it so every embedding is paired with the review it came from.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        processed_data.append(
            {
                "values": embedding,
                # NOTE(review): the professor name is used as the vector id, so two
                # reviews for the same professor would share an id - confirm names
                # are unique in reviews.json or switch to a per-review id.
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )


In [14]:
# Inspect the first record (keys: "values", "id", "metadata") before upserting.
# NOTE(review): this displays the entire embedding list; consider showing only
# a slice of "values" to keep the notebook output short.
processed_data[0]

{'values': [-0.015436628,
  -0.0205592,
  0.008907072,
  0.06974048,
  0.016148735,
  -0.026026338,
  -0.003302106,
  0.009624921,
  -0.010641395,
  -0.0057600215,
  0.016803412,
  0.007356518,
  -0.028645052,
  -0.019950463,
  0.0012268042,
  0.048882656,
  0.012668601,
  0.016516272,
  0.0016424388,
  0.014448867,
  -0.0054642675,
  -0.038132146,
  0.018158711,
  -0.030230062,
  0.012898313,
  -0.030023322,
  -0.004720576,
  0.013231395,
  0.016596671,
  0.024762923,
  0.05774953,
  -0.011445386,
  -0.0070578926,
  -0.0066960966,
  -0.038591567,
  0.050214984,
  -0.016458845,
  -0.021765186,
  0.008849644,
  0.014391439,
  -0.027841061,
  0.011623413,
  -0.013702304,
  0.019364698,
  0.028369397,
  -0.01969778,
  -0.04192239,
  0.024533212,
  0.023373168,
  0.024004875,
  -0.02882882,
  0.0017903157,
  0.022488778,
  0.002198054,
  -0.048698884,
  0.04722873,
  0.03137862,
  0.02643982,
  0.0039050994,
  -0.026715472,
  0.050444692,
  0.014965719,
  -0.02471698,
  -0.016412903,
  -0.

In [15]:
# Get a handle to the "rag" index.
index = pc.Index("rag")

# Write every prepared record into the "ns1" namespace; the call's return
# value is the cell's displayed result.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [16]:
# Show index statistics as a sanity check after the upsert.
# NOTE(review): presumably the "ns1" vector count should match the number of
# upserted records; stats may lag briefly after a write - confirm.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}