In [3]:
import json
import os
import time

from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


  from tqdm.autonotebook import tqdm


In [4]:
# Load environment variables (PINECONE_API_KEY is read from .env / the environment)
load_dotenv()

# Create an instance of the Pinecone client
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Create the serverless index only if it is missing: create_index raises when
# an index with the same name already exists, which would break every re-run
# of this cell (e.g. after Restart Kernel -> Run All).
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


In [5]:
# Load the professor reviews. The context manager closes the file handle
# (a bare open() inside json.load leaves it open), and the encoding is pinned
# to UTF-8 so non-ASCII characters in the reviews decode the same on every OS.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)
data['reviews']

[{'professor': 'Dr. James Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Amazing professor! Clear lectures and always willing to help.'},
 {'professor': 'Prof. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 3,
  'review': 'Good teacher but the exams are difficult.'},
 {'professor': 'Dr. Michael Lee',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures and practical examples.'},
 {'professor': 'Prof. Sarah Williams',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Hard to follow during lectures, but she offers a lot of resources.'},
 {'professor': 'Dr. David Brown',
  'subject': 'History',
  'stars': 5,
  'review': 'Makes history come alive! Highly recommended.'},
 {'professor': 'Prof. Jessica Martinez',
  'subject': 'Literature',
  'stars': 3,
  'review': 'Interesting readings but unclear grading criteria.'},
 {'professor': 'Dr. Richard Davis',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Challenging course but he’s very supporti

In [16]:
# Embed every review and shape the results into Pinecone upsert records.
# NOTE: this cell needs the imports cell to have been run in the current
# kernel; "NameError: name 'OpenAI' is not defined" means it has not been.
reviews = data['reviews']

client = OpenAI()

# Processed data storage
processed_data = []

if reviews:  # nothing to embed -> skip the API call entirely
    # One batched request for all reviews instead of one round trip per review.
    response = client.embeddings.create(
        input=[review['review'] for review in reviews],
        model="text-embedding-3-small"  # Correct model name for embedding generation
    )
    # Each returned item carries the index of the input it was computed from;
    # sort on it so the zip below pairs every review with its own embedding.
    embeddings = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(reviews, embeddings):
        processed_data.append({
            "values": item.embedding,
            # The professor name is the vector id, so a later review for the
            # same professor overwrites the earlier one on upsert.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })


In [17]:
# Insert the embeddings into the Pinecone index
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Stats read immediately after an upsert can still report zero vectors
# because the index has not caught up yet. Poll for a bounded time until
# namespace "ns1" holds at least as many vectors as distinct ids were sent.
expected_count = len({vector["id"] for vector in processed_data})
for _ in range(10):
    stats = index.describe_index_stats()
    ns_summary = stats.namespaces.get("ns1")
    if ns_summary is not None and ns_summary.vector_count >= expected_count:
        break
    time.sleep(1)

# Print index statistics (the last snapshot polled, even if the wait timed out)
print(stats)

Upserted count: 20
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
