In [20]:
from dotenv import load_dotenv
load_dotenv()

import os
import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec

In [21]:
# Connect to Pinecone with the API key read from the environment (populated by load_dotenv()).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only if it is not there yet, so this cell can be re-run
# (e.g. Restart Kernel -> Run All) without failing on an already-existing index.
# The index stores 768-dimensional vectors compared by cosine similarity.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [22]:
import json

# Load the review dataset. The context manager closes the file handle as soon as
# parsing finishes, and the encoding is pinned so the result does not depend on
# the platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data["reviews"]

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Dr. Johnson is an amazing professor! Her lectures are clear, and she's always willing to help outside of class."},
 {'professor': 'Dr. Michael Smith',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Good professor with a deep understanding of the subject, but sometimes the lectures can be a bit fast-paced.'},
 {'professor': 'Dr. Linda Brown',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Very engaging lectures and thorough explanations. Highly recommend taking her classes.'},
 {'professor': 'Dr. James Williams',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'The material is challenging, and the exams are tough. Dr. Williams is knowledgeable but not very approachable.'},
 {'professor': 'Dr. Patricia Davis',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Solid professor with clear lectures. Sometimes difficult to follow the labs, but overall a good experience.'},
 {'profe

In [23]:
import os

# Embedding model used for every review text.
model = "models/embedding-001"
gemini_api = os.getenv("GEMINI_API_KEY")
# configure() registers the credentials on the genai module; its return value is
# not a client object, so nothing is kept from the call.
genai.configure(api_key=gemini_api)

# Build one record per review in the shape passed to index.upsert():
# the embedding as "values", an "id", and the remaining review fields as "metadata".
processed_data = []
for review in data["reviews"]:
    embedding = genai.embed_content(
        model=model,
        content=review["review"],
        task_type="retrieval_document",
        title="Professor reviews",
    )

    processed_data.append({
        "values": embedding["embedding"],
        # NOTE(review): the professor name doubles as the vector id, so two reviews
        # for the same professor would share one id — confirm names are unique in reviews.json.
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        },
    })

In [24]:
# Inspect the first prepared record ("values", "id", "metadata") as a sanity check.
processed_data[0]

{'values': [0.07007094,
  -0.023314208,
  0.021203311,
  -0.045950316,
  0.028897638,
  0.049037576,
  0.010067357,
  -0.0020516084,
  0.00048514755,
  0.01851026,
  -0.05239013,
  -0.014256596,
  -0.012514922,
  0.030114835,
  0.0160742,
  -0.011741203,
  0.0078880945,
  -0.0087493695,
  -0.032592047,
  0.004424307,
  0.0069757826,
  -0.0451859,
  0.054481342,
  -0.047891535,
  0.013787645,
  -0.03179486,
  0.014875305,
  -0.036023647,
  0.032768454,
  -0.0156482,
  -0.061089814,
  0.000115662944,
  -0.016428534,
  0.033133164,
  -0.00048535096,
  -0.024506511,
  0.030742135,
  -0.020713829,
  -0.010737008,
  0.041642528,
  0.0125393905,
  -0.036110427,
  -0.02951852,
  0.034744922,
  0.024395736,
  0.0008132703,
  -0.0069396766,
  0.0022072832,
  0.022519724,
  -0.040315166,
  0.03720598,
  0.024426982,
  0.05500181,
  0.0042566666,
  0.026254414,
  -0.023381568,
  0.052016914,
  -0.048656642,
  -0.031201085,
  0.02155857,
  -0.0053261546,
  0.0007840299,
  0.01720025,
  -0.023800835

In [25]:
# Get a handle to the "rag" index and write every prepared record into namespace "ns1".
# The upsert response is the cell's last expression, so it is shown as the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [26]:
# Show the index statistics to check the vector count recorded for the "ns1" namespace.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}