In [47]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai

In [48]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=768, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [52]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Johnson's lectures are engaging and well-structured. She's always willing to help during office hours."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Brilliant teacher! Prof. Chen makes complex concepts easy to understand and provides great real-world examples.'},
 {'professor': 'Dr. Sarah Thompson',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Dr. Thompson knows her subject well, but her lectures can be a bit dry. More interactive sessions would be helpful.'},
 {'professor': 'Prof. David Rodriguez',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Prof. Rodriguez is a math genius! His problem-solving techniques have really improved my analytical skills.'},
 {'professor': 'Dr. Lisa Patel',
  'subject': 'Chemistry',
  'stars': 4,
  'review': "Dr. Patel's lab sessions are well-organized and informative. She ensures safety while mak

In [54]:
processed_data = []

for review in data['reviews']:    
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    result = genai.embed_content(
        model="models/text-embedding-004",
        content=review['review'],
        task_type="retrieval_document",
        title="Embedding of single string")
    embeddings = result['embedding']
    
    processed_data.append({
        "values": embeddings,
        "id": review['professor'],
        "metadata":{
            'review': review['review'],
            'subject': review['subject'],
            'stars': review['stars'],
        }
    })

In [55]:
processed_data[0]

{'values': [0.02817522,
  0.0014556544,
  -0.04054408,
  -0.021547113,
  0.018642386,
  0.0075332155,
  0.037805468,
  0.07391174,
  -0.011450953,
  0.049247935,
  0.049592644,
  0.04445013,
  0.0653748,
  -0.004027851,
  -0.043482166,
  -0.07656166,
  0.042587515,
  -0.006708978,
  -0.090301275,
  0.030635454,
  -0.02132392,
  -0.03881977,
  0.044832066,
  -0.075058594,
  0.006850507,
  0.0017852198,
  0.024461905,
  -0.04516572,
  0.021716531,
  -0.046757545,
  0.056306597,
  0.03886994,
  0.008008241,
  -0.04811913,
  -0.014073378,
  0.037331656,
  -0.0044868467,
  0.0054549663,
  0.043537658,
  -0.040872663,
  -0.06362195,
  0.010012392,
  0.015062288,
  0.03227296,
  -0.071889244,
  -0.01710286,
  -0.01331075,
  0.06357304,
  0.0136215575,
  0.07275382,
  0.071541384,
  0.047437295,
  -0.047591183,
  0.049107708,
  -0.021920083,
  -0.049304202,
  -0.013416235,
  -0.029758545,
  0.026691511,
  -0.031692352,
  0.003912432,
  -0.027392166,
  -0.032635365,
  -0.07340407,
  0.055592094

In [56]:
index = pc.Index("rag")
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [58]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}