In [1]:
# Load variables from a local .env file before anything reads the environment.
# NOTE(review): presumably this is where PINECONE_API_KEY and the OpenAI key
# come from — confirm the .env contents.
from dotenv import load_dotenv
load_dotenv()
import os
# NOTE(review): the visible cells construct the client via `openai.OpenAI()`,
# so the direct `OpenAI` import below appears unused in this part of the notebook.
from openai import OpenAI
import openai
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [2]:
# Connect to Pinecone; the API key is read from the environment rather than
# being hardcoded in the notebook.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so this cell can be
# re-run (Restart Kernel -> Run All) without failing on an existing index.
# The dimension (1536) must match the length of the vectors upserted later.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [3]:
import json

# Read the reviews file inside a context manager so the file handle is closed
# as soon as parsing finishes (the bare `open(...)` left it open).
# JSON is decoded explicitly as UTF-8 rather than the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list stored under the "reviews" key.
data["reviews"]


[{'professor': 'Dr. John Doe',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Dr. Doe explains concepts well and is always available for extra help.'},
 {'professor': 'Dr. Lisa Smith',
  'subject': 'English Literature',
  'stars': 5,
  'review': 'Dr. Smith is passionate about literature and makes the classes engaging.'},
 {'professor': 'Prof. Michael Johnson',
  'subject': 'History',
  'stars': 3,
  'review': 'Prof. Johnson knows his subject well, but his lectures can be a bit dry.'},
 {'professor': 'Dr. Emily Brown',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Dr. Brown is very knowledgeable and always encourages questions.'},
 {'professor': 'Prof. David Williams',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Prof. Williams is excellent at breaking down complex programming concepts.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Psychology',
  'stars': 2,
  'review': "Dr. Davis is tough on grading and doesn't provide much feedback."},
 {'professor': '

In [10]:
# Embedding model used for every review.
EMBEDDING_MODEL = "text-embedding-3-small"
# Number of reviews sent per embeddings request.
# NOTE(review): confirm against the API's per-request input limits before raising.
EMBED_BATCH_SIZE = 100

processed_data = []
client = openai.OpenAI()
reviews = data["reviews"]

# Embed the reviews in batches: one request per batch instead of one network
# round trip per review. With no reviews, no request is made at all.
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the position of its input; sort on it so the
    # pairing below does not depend on the order of the response list.
    embeddings = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(batch, embeddings):
        embedding = item.embedding
        # One record per review: the vector, its id, and searchable metadata.
        # NOTE(review): the professor name is the vector id, so two reviews of
        # the same professor would share an id — confirm names are unique.
        processed_data.append({
            "values": embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

In [11]:
# Inspect the first prepared record (embedding values, id, metadata) as a
# sanity check before writing anything to the index.
processed_data[0]

{'values': [-0.009336256,
  -0.032660168,
  0.020278752,
  0.03482859,
  0.02936738,
  0.008218583,
  -0.0016380278,
  0.040584274,
  -0.02445497,
  -0.007181221,
  0.016223,
  -0.011096425,
  -0.027011564,
  -0.05873476,
  0.030598829,
  0.04344873,
  -0.039915007,
  -0.00054796133,
  0.03780013,
  0.0750113,
  0.006565497,
  -0.0018421538,
  0.053862505,
  -0.00821189,
  -0.029849252,
  -0.02762729,
  0.046045482,
  0.023009356,
  0.018926835,
  0.0053808964,
  0.06772969,
  -0.021148797,
  -0.029313838,
  -0.015058478,
  -0.030331122,
  0.018244183,
  -0.0113975955,
  0.02531163,
  0.013291617,
  -0.008961468,
  0.006538726,
  0.02130942,
  -0.05616478,
  -0.0024980344,
  0.026409226,
  -0.013064067,
  -0.013686484,
  -0.022219623,
  0.041548017,
  0.04264561,
  -0.047464326,
  0.017320598,
  0.019007146,
  0.017869394,
  -0.04387706,
  0.033382975,
  0.03233892,
  -0.005062995,
  0.0067662764,
  -0.039085116,
  0.055575825,
  0.01224087,
  -0.030411433,
  -0.024133721,
  -0.0172670

In [12]:
# Open a handle to the "rag" index, then write every prepared record into the
# "ns1" namespace. The upsert call stays last so its result is the cell output.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [13]:
# Display the index statistics (dimension, per-namespace vector counts) to
# check that the upserted records are present.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}