In [45]:
from dotenv import load_dotenv
load_dotenv()
import os
import json
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [46]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag",
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [47]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. John Smith',
  'subject': 'Introduction to Programming',
  'stars': 4,
  'review': 'Dr. Smith is very knowledgeable and explains concepts clearly. His assignments are challenging but fair.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Data Structures',
  'stars': 5,
  'review': 'Dr. Johnson makes complex topics easy to understand. Her lectures are engaging and well-organized.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Algorithms',
  'stars': 3,
  'review': "Dr. Brown's course is tough, but he provides helpful resources. The exams are difficult, but he offers good feedback."},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Computer Systems',
  'stars': 4,
  'review': 'Dr. Davis is very approachable and her practical examples make the subject matter more interesting.'},
 {'professor': 'Dr. David Wilson',
  'subject': 'Software Engineering',
  'stars': 2,
  'review': "Dr. Wilson's course is poorly structured and lacks real-world applications. He is kn

In [48]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Process data
processed_data = []

for review in data['reviews']:
    embedding = model.encode(review['review'])
    processed_data.append({
        "values": embedding.tolist(),  # Convert numpy array to list for JSON serialization
        "id": review['professor'],
        "metadata": {
            "review": review['review'],
            "subject": review['subject'],
            "stars": review['stars'],
        }
    })

# Print processed data (or further use it as needed)
print(processed_data)

[{'values': [0.03727830573916435, 0.0033653569407761097, -0.013686032965779305, 0.00406250637024641, -0.08112864196300507, 0.024339692667126656, 0.02443348802626133, -0.008448479697108269, -0.09851639717817307, 0.10841765254735947, -0.12324757128953934, 0.07862605899572372, -0.04523025453090668, 0.07652527093887329, -0.08740367740392685, 0.04648546501994133, 0.0004011106211692095, 0.012045737355947495, -0.004911721684038639, -0.04246673360466957, 0.003564985003322363, 0.04178914800286293, 0.02935338020324707, -0.009415719658136368, -0.021909818053245544, 0.014007246121764183, -0.007520563900470734, -0.03971168026328087, 0.03940797224640846, -0.04549260064959526, -0.05007936805486679, 0.009662632830440998, 0.04178831726312637, 0.017916055396199226, -0.007730735931545496, 0.058569811284542084, 0.007535481359809637, 0.14608559012413025, -0.05455445870757103, 0.06833357363939285, 0.007600669749081135, 0.07634472846984863, 0.06730569154024124, 0.024625947698950768, 0.08138330280780792, -0.0

In [49]:
processed_data[0]

{'values': [0.03727830573916435,
  0.0033653569407761097,
  -0.013686032965779305,
  0.00406250637024641,
  -0.08112864196300507,
  0.024339692667126656,
  0.02443348802626133,
  -0.008448479697108269,
  -0.09851639717817307,
  0.10841765254735947,
  -0.12324757128953934,
  0.07862605899572372,
  -0.04523025453090668,
  0.07652527093887329,
  -0.08740367740392685,
  0.04648546501994133,
  0.0004011106211692095,
  0.012045737355947495,
  -0.004911721684038639,
  -0.04246673360466957,
  0.003564985003322363,
  0.04178914800286293,
  0.02935338020324707,
  -0.009415719658136368,
  -0.021909818053245544,
  0.014007246121764183,
  -0.007520563900470734,
  -0.03971168026328087,
  0.03940797224640846,
  -0.04549260064959526,
  -0.05007936805486679,
  0.009662632830440998,
  0.04178831726312637,
  0.017916055396199226,
  -0.007730735931545496,
  0.058569811284542084,
  0.007535481359809637,
  0.14608559012413025,
  -0.05455445870757103,
  0.06833357363939285,
  0.007600669749081135,
  0.076344

In [50]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data, 
    namespace = 'ns1',
)

{'upserted_count': 20}

In [51]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}