In [33]:
from dotenv import load_dotenv
import os
import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec
load_dotenv()
import json

In [44]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"),)
pc.create_index(
    name="rag",
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'))


In [28]:
import json
data= json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Alice Smith',
  'subject': 'Data Structures',
  'stars': 4,
  'review': 'Dr. Smith explains concepts clearly and provides great examples. Could use more hands-on exercises.'},
 {'professor': 'Dr. Alice Smith',
  'subject': 'Data Structures',
  'stars': 3,
  'review': 'Good knowledge but lectures are sometimes too fast. More review sessions would help.'},
 {'professor': 'Dr. John Doe',
  'subject': 'Calculus I',
  'stars': 5,
  'review': 'Exceptional instructor. Very engaging and makes complex topics easy to understand.'},
 {'professor': 'Dr. John Doe',
  'subject': 'Calculus I',
  'stars': 4,
  'review': 'Great teacher, but office hours are always crowded. Still, his explanations are very helpful.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'World History',
  'stars': 4,
  'review': 'Dr. Johnson has a lot of knowledge and presents information well, though lectures can be lengthy.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'World History',
  'stars':

In [34]:

processed_data = []

# Initialize Google Gemini API client
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)


def get_gemini_embedding(text, model="models/text-embedding-004"):
    result = genai.embed_content(
        model=model,
        content=text,
        task_type="semantic_similarity"  # Adjust if needed
    )
    if result is not None:
        return result['embedding']
    else:
        print("Error generating text embedding with Gemini")
        return None

# Create embeddings for each review
for review in data["reviews"]:
    embedding = get_gemini_embedding(review['review'])
    if embedding is not None:
        processed_data.append(
            {
                "values": embedding,
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                }
            }
        )

In [35]:
processed_data[0]

{'values': [-0.029528882,
  -0.011316048,
  -0.07558129,
  0.036319487,
  0.022836627,
  0.017526772,
  -0.011026599,
  0.051032197,
  -0.0051940805,
  -0.023931744,
  0.07274111,
  0.05968107,
  0.009243532,
  0.010471775,
  0.039073206,
  0.00519387,
  0.081534274,
  0.011726258,
  -0.04938073,
  0.0039349278,
  0.0032299957,
  -0.020892251,
  0.0061783195,
  -0.06680725,
  -0.04185384,
  0.016113883,
  0.022190487,
  -0.027746694,
  0.030529523,
  -0.011510399,
  0.05639191,
  -0.006916137,
  -0.028094374,
  -0.0418019,
  -0.00544283,
  0.0026552058,
  0.0011958794,
  -0.08018071,
  0.012133681,
  0.007827094,
  -0.019120377,
  -0.0119022755,
  -0.066602804,
  0.027488336,
  -0.061329816,
  -0.00958118,
  0.0007262533,
  0.08737018,
  0.039951276,
  0.0029384454,
  0.019220768,
  0.039072633,
  -0.022149052,
  -0.0008334773,
  -0.015782276,
  -0.056850154,
  -0.033380978,
  -0.042093474,
  0.058414716,
  0.0033239638,
  -0.074190244,
  -0.028334538,
  -0.03400565,
  -0.05693678,
  0

In [45]:
index = pc.Index('rag')
index.upsert(
  vectors=processed_data,
  namespace="ns1"
)

{'upserted_count': 22}

In [46]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 11}},
 'total_vector_count': 11}