In [5]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

load_dotenv()
from pinecone import Pinecone, ServerlessSpec
genai.configure(api_key=os.getenv("NEXT_PUBLIC_GEMINI_API"))

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
pc = Pinecone(api_key=os.getenv("NEXT_PUBLIC_PINECONE_KEY"))

pc.create_index(    
    name="rag", dimension=768, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': 'f0172f909a9c7b70a24d9cd9f294917d', 'Date': 'Mon, 02 Sep 2024 16:01:28 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [42]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Sarah Johnson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Dr. Johnson is amazing at explaining complex concepts. Her lectures are clear and engaging.'},
 {'professor': 'Prof. Ahmed Khan',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Prof. Khan is very knowledgeable, though his lectures can sometimes be a bit fast-paced.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Dr. Davis is good, but her grading is quite strict. You need to work hard to score well.'},
 {'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Dr. Smith is the best professor I've had. His programming assignments are challenging but rewarding."},
 {'professor': 'Prof. Linda Brown',
  'subject': 'History',
  'stars': 4,
  'review': "Prof. Brown knows her subject well. Her classes are interesting, but there's a lot of reading."},
 {'professor': 'Dr. Mark Thompson',
  'subject': 'Economics',
  'stars': 5,

In [43]:
processed_data = []
for review in data["reviews"]:
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=review["review"],
    )
    embedding=response["embedding"]
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata":{
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [9]:
processed_data[0]

{'values': [0.05284405,
  0.010153905,
  -0.020517474,
  -0.0062435684,
  0.009113799,
  0.013385664,
  -0.010499484,
  0.013686945,
  -0.041573077,
  0.026221707,
  0.051833928,
  0.025643328,
  0.015276398,
  -0.019998193,
  -0.013441667,
  -0.07829305,
  0.013452393,
  0.014510987,
  -0.109780096,
  0.033278055,
  0.01624381,
  -0.033194985,
  0.020882556,
  -0.027429497,
  0.021268252,
  -0.014869176,
  0.017015306,
  -0.025359454,
  0.073306024,
  -0.051995587,
  0.05551562,
  0.010532271,
  0.018146561,
  -0.06296308,
  -0.07697997,
  0.021574691,
  0.0052241036,
  -0.015455886,
  0.058489956,
  -0.029103843,
  -0.036602825,
  -0.033963203,
  0.005813924,
  0.00759841,
  -0.049076106,
  0.0041022133,
  -0.00749939,
  0.08936182,
  0.028732304,
  0.07894004,
  0.06003783,
  0.0327365,
  -0.075344786,
  0.072997056,
  -0.020223657,
  -0.036012832,
  -0.037295546,
  -0.020223502,
  0.03377042,
  -0.028938077,
  -0.028694449,
  0.00053816463,
  -0.034752686,
  -0.06138192,
  0.091438

In [44]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="nsl"
)

{'upserted_count': 23}

In [12]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'nsl': {'vector_count': 20}},
 'total_vector_count': 20}

In [41]:
from bs4 import BeautifulSoup
import math
import requests


page = requests.get("https://www.ratemyprofessors.com/professor/2553649")
if page.status_code == 200:
    soup = BeautifulSoup(page.text, "lxml")
    prof_stars = round(int(soup.find("div", class_= "RatingValue__Numerator-qw8sqy-2 liyUjw").text))
    prof_name = soup.find("div", class_="NameTitle__Name-dowf0z-0 cfjPUG").text
    prof_course = soup.find("a", class_="TeacherDepartment__StyledDepartmentLink-fl79e8-0 iMmVHb").text
    words = prof_course.split()
    prof_course = " ".join(words[:-1])    
    prof_review = soup.find("div", class_="Comments__StyledComments-dzzyvm-0 gRjWel").text
    
    new_entry = {
        "professor": prof_name,
        "subject": prof_course,
        "stars": prof_stars,
        "review": prof_review
    }
    
    with open("reviews.json", "r") as file:
        data = json.load(file)
        
        data["reviews"].append(new_entry)
        
        with open("reviews.json", "w") as file:
            json.dump(data,file, indent=4)
    
else:
    print(f"Failed to retrieve page with status code: {page.status_code}")