In [11]:
# load_dotenv() runs before the API clients are imported/created so that keys
# kept in a local .env file are present in the process environment
# (PINECONE_API_KEY is read later through os.getenv).
# NOTE(review): presumably the .env file also supplies OPENAI_API_KEY for
# OpenAI() — confirm.
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [7]:
# Connect to Pinecone; the API key comes from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an already-existing index.
# The dimension must match the length of the vectors upserted into the index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [9]:
import json

# Read the review dataset. The context manager closes the file handle as soon
# as parsing is done; the encoding is pinned to UTF-8 (the JSON standard) so the
# result does not depend on the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Show the parsed reviews: a list of dicts with the keys
# 'professor', 'subject', 'stars' and 'review'.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Introduction to Computer Science',
  'stars': 4,
  'review': 'Great professor, explains concepts clearly and is very approachable.'},
 {'professor': 'Dr. Jane Doe',
  'subject': 'Data Structures',
  'stars': 5,
  'review': 'Amazing lectures and very helpful during office hours.'},
 {'professor': 'Dr. Alan Turing',
  'subject': 'Algorithms',
  'stars': 3,
  'review': 'Challenging course, but the professor is knowledgeable.'},
 {'professor': 'Dr. Grace Hopper',
  'subject': 'Computer Systems',
  'stars': 4,
  'review': 'Very organized and always available to help.'},
 {'professor': 'Dr. Ada Lovelace',
  'subject': 'Discrete Mathematics',
  'stars': 5,
  'review': 'Incredible teacher, made complex topics easy to understand.'},
 {'professor': 'Dr. Barbara Liskov',
  'subject': 'Software Engineering',
  'stars': 2,
  'review': 'Course was too fast-paced, and the professor was hard to follow.'},
 {'professor': 'Dr. Donald Knuth',
  'subject': 'T

In [12]:
# Embeddings capture the semantics of a piece of text in numerical (vector) form.
# Convert every review into an embedding and shape it as a record ready for upsert:
#   values   -> the embedding vector
#   id       -> the professor's name
#   metadata -> review text, subject and star rating
# NOTE(review): because the id is the professor name, two reviews for the same
# professor would share an id — confirm names are unique in reviews.json.
client = OpenAI()
reviews = data['reviews']

# The embeddings endpoint accepts a list of inputs, so reviews are sent in
# batches instead of one network round trip per review.
EMBED_BATCH_SIZE = 100

processed_data = []
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small",
    )
    if len(response.data) != len(batch):
        raise ValueError(
            f"expected {len(batch)} embeddings, got {len(response.data)}"
        )
    # Each result carries the position of its input; order by it so every
    # embedding is paired with the review it was computed from.
    ordered = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered):
        processed_data.append({
            "values": item.embedding,
            "id": review['professor'],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })



In [13]:
# Sanity-check the first prepared record; it should hold 'values' (the
# embedding), 'id' and 'metadata'.
# NOTE(review): this displays the entire embedding vector; previewing only a
# slice of 'values' would keep the saved output short.
processed_data[0]

{'values': [-0.04109343,
  -0.030370235,
  -0.040680066,
  0.025239635,
  -0.0004779543,
  -0.025142372,
  -0.023099858,
  0.038078293,
  -0.028862664,
  -0.04495962,
  0.02041298,
  -0.010334147,
  -0.015647115,
  0.0015524017,
  0.0052977703,
  -0.007568243,
  -0.01243745,
  0.008777946,
  0.025118057,
  0.039974913,
  0.013081814,
  -0.0067293537,
  0.010328068,
  0.046005193,
  -0.055245135,
  -0.03173191,
  0.017774733,
  0.01706958,
  -0.016327953,
  0.008540869,
  0.08529927,
  -0.009999807,
  -0.031099705,
  -0.056315023,
  -0.028327722,
  0.044667833,
  -0.017713943,
  0.01801789,
  -0.012182136,
  0.023197122,
  0.0132763395,
  0.04362226,
  -0.013969336,
  -0.013081814,
  0.030467497,
  -0.004708116,
  -0.048947383,
  -0.0058600693,
  0.010297674,
  0.05004159,
  -0.029494872,
  -0.022601388,
  0.04116638,
  0.015513379,
  -0.062004883,
  0.034795683,
  0.025580054,
  -0.003747648,
  0.010011965,
  -0.027014676,
  0.048388124,
  0.0012021045,
  0.0024133273,
  -0.010723198,


In [14]:
# An index is the top-level store for vectors; a namespace is a named
# partition of records inside that index.
# Write the prepared review records into namespace "ns1" of the "rag" index.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [15]:
# Confirm the upsert landed: the stats report the index dimension, the vector
# count per namespace and the total vector count.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}