In [10]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


In [7]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [8]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Johnson is always well-prepared, provides comprehensive lectures, and encourages critical thinking.'},
 {'professor': 'Dr. Michael Gray',
  'subject': 'Mathematics',
  'stars': 4,
  'review': "Professor Gray's explanations are clear, and he often stays after class to address students' questions."},
 {'professor': 'Dr. Samantha Nguyen',
  'subject': 'Physics',
  'stars': 3,
  'review': "Dr. Nguyen's lectures are rich in content, but they can be dense and fast-paced at times."},
 {'professor': 'Dr. Rahul Chopra',
  'subject': 'Data Science',
  'stars': 5,
  'review': 'His real-world examples make complex data modeling topics easier to understand.'},
 {'professor': 'Dr. Beatrice Kim',
  'subject': 'Literature',
  'stars': 2,
  'review': 'She has interesting insights, but her lecture structure can be a bit disorganized.'},
 {'professor': 'Dr. Javier Morales',
  'subject': 'Chemistry',
  'sta

In [11]:
from torch import embedding


processed_data = []
client = OpenAI()

for reviews in data['reviews']:
    response = client.embeddings.create(
        input=reviews['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": reviews["professor"],
        "metadata":{
            "review": reviews["review"],
            "subject": reviews["subject"],
            "stars": reviews["stars"]
        }
    })

In [12]:
processed_data[0]

{'values': [0.011846266686916351,
  -0.005602142307907343,
  0.03495087847113609,
  0.03868113458156586,
  0.011407015845179558,
  -0.007717305328696966,
  0.03200451657176018,
  0.056602586060762405,
  0.016556391492486,
  0.023097854107618332,
  0.03654569759964943,
  -0.009116151370108128,
  -0.007987613789737225,
  0.020286647602915764,
  0.021705767139792442,
  0.022084198892116547,
  -0.048817701637744904,
  0.0015492051606997848,
  0.042005930095911026,
  0.066225565969944,
  0.028814878314733505,
  -0.024084480479359627,
  0.005159512162208557,
  -0.02789582870900631,
  -0.036707885563373566,
  -0.020178524777293205,
  0.0017468682490289211,
  0.024652129039168358,
  0.02262481488287449,
  -0.00857553444802761,
  0.10028442740440369,
  0.0038214854430407286,
  -0.006818530149757862,
  0.020016338676214218,
  -0.030193451792001724,
  0.03989752382040024,
  0.012373368255794048,
  -0.018786435946822166,
  0.022354505956172943,
  0.013015350326895714,
  -0.004699987825006247,
  -0

In [13]:
from blinker import Namespace


index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    Namespace="ns1"
)

{'upserted_count': 20}

In [14]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20}},
 'total_vector_count': 20}