In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel

# Google Cloud project used for the Vertex AI calls below; replace with your own project ID.
project_id = "rate-my-prof-rag-433210"

# Point the SDK at the project and region.
vertexai.init(
    project=project_id,
    location="us-central1",
)

model = GenerativeModel("gemini-1.5-flash-001")

# Send a single prompt and print the text of the reply.
response = model.generate_content("How is PI derived? Please explain in detail.")

print(response.text)

In [None]:
from dotenv import load_dotenv
# Load settings from a .env file into the environment so later os.getenv lookups can see them.
load_dotenv()
import os
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Pinecone client; the API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists is an error, so only create it when it is
# missing. This keeps the cell safe to re-run (e.g. Restart Kernel -> Run All).
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
import json
# Parse the review file; the with-block closes the file handle once parsing is done,
# and the explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)
data

In [78]:

from typing import List, Optional
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
import json

def embed_reviews(reviews: List[dict], model_name: str = "text-embedding-004", dimensionality: Optional[int] = 768) -> List[List[float]]:
    """Embed the text of each review with a pre-trained text-embedding model.

    Args:
        reviews: List of review dictionaries. The text to embed is read from
            each dictionary's ``'review'`` key.
        model_name: Name passed to ``TextEmbeddingModel.from_pretrained``.
        dimensionality: Forwarded as ``output_dimensionality``; a falsy value
            (``None`` or ``0``) omits the argument.

    Returns:
        One list of embedding values per embedding returned by
        ``get_embeddings``. An empty ``reviews`` list returns ``[]`` without
        loading the model.

    Raises:
        ValueError: If ``reviews`` is not a list of dictionaries.
        KeyError: If a review dictionary has no ``'review'`` key.
    """
    # Validate first so malformed input fails immediately instead of after
    # the model has been looked up.
    if not isinstance(reviews, list) or not all(isinstance(review, dict) for review in reviews):
        raise ValueError("Reviews must be a list of dictionaries.")

    # Nothing to embed: skip the model and the request entirely.
    if not reviews:
        return []

    inputs = [TextEmbeddingInput(review['review']) for review in reviews]
    model = TextEmbeddingModel.from_pretrained(model_name)
    kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]

# Parse the review file; the with-block closes the file handle once parsing is done,
# and the explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# One record (values / id / metadata) is appended per review.
processed_data = []

# The list of reviews sits under the top-level 'reviews' key.
reviews_list = data['reviews']

for review in reviews_list:
    professor = review['professor']
    subject = review['subject']
    stars = review['stars']
    review_text = review['review']

    # embed_reviews takes a list and returns one vector per item,
    # hence the single-element list here and the [0] below.
    embeddings = embed_reviews([review], model_name="text-embedding-004")

    processed_data.append({
        "values": embeddings[0],
        # NOTE(review): the professor name doubles as the record id, so several
        # reviews of one professor would share an id - confirm this is intended.
        "id": professor,
        "metadata": {
            "review": review_text,
            "subject": subject,
            "stars": stars,
        },
    })

In [None]:
# Display the first prepared record to check its structure.
processed_data[0]

In [None]:
# Get a handle to the 'rag' index and write every prepared record into namespace "ns1".
index = pc.Index('rag')
index.upsert(vectors=processed_data, namespace="ns1")

In [None]:
# Display the statistics reported for the index.
index.describe_index_stats()

In [None]:
def calculate_text_embeddings(text: str, model_name: str = "text-embedding-004", dimensionality: Optional[int] = 768) -> List[List[float]]:
    """Embed a single piece of text with a pre-trained text-embedding model.

    Args:
        text: The string to embed.
        model_name: Name passed to ``TextEmbeddingModel.from_pretrained``.
        dimensionality: Forwarded as ``output_dimensionality``; a falsy value
            (``None`` or ``0``) omits the argument.

    Returns:
        A list with one entry per embedding returned by ``get_embeddings``
        (one input is sent), each entry being a fresh list of the embedding's
        values.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    # Validate first so a wrong argument type fails immediately instead of
    # after the model has been looked up.
    if not isinstance(text, str):
        raise ValueError("Input must be a string.")

    model = TextEmbeddingModel.from_pretrained(model_name)
    # The API takes a list of inputs, so wrap the single text.
    inputs = [TextEmbeddingInput(text)]
    kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    # list(...) copies the values, as the original element-by-element comprehension did.
    return [list(embedding.values) for embedding in embeddings]

In [157]:
import requests # type: ignore
from bs4 import BeautifulSoup # type: ignore

url = 'https://www.ratemyprofessors.com/professor/2175825'

# Without a timeout the request can wait indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page and
# failing later with an unrelated IndexError in the selectors.
page.raise_for_status()

# NOTE(review): 'html' names a markup type rather than a specific parser, so the
# parser actually used may depend on what is installed - consider pinning one.
soup = BeautifulSoup(page.text, 'html')

In [158]:
# NOTE(review): the class names below look auto-generated; confirm they still
# match the live page before relying on these selectors.

# Every <ul> carrying the 'cbdtns' class; only the first one is used below.
ratingsData = soup.find_all('ul', class_='cbdtns')

# All <div>s nested under the first <div> of the first 'DObVa' block.
stars_data = (
    ratingsData[0]
    .find_all('div', class_='DObVa')[0]
    .find_all('div')[0]
    .find_all('div')
)

# First 'gRjWel' <div> inside the same <ul>.
review_data = ratingsData[0].find_all('div', class_='gRjWel')[0]

# All <span>s inside the first 'kFNvIp' <div> on the page.
professor_data = (
    soup
    .find_all('div', class_='kFNvIp')[0]
    .find_all('span')
)

# First <b> within the first <a> of the first 'iLYGwn' <div>.
subject_data = (
    soup
    .find_all('div', class_='iLYGwn')[0]
    .find_all('a')[0]
    .find_all('b')[0]
)


In [159]:
# NOTE(review): stars is kept as the element's text (a string), not converted to a number.
stars = stars_data[1].text
review = review_data.text
# split() + join collapses runs of whitespace and trims both ends, so the name
# no longer carries stray leading/trailing spaces from the span text.
professor = ' '.join(f'{professor_data[1].text} {professor_data[2].text}'.split())
# NOTE(review): only the first whitespace-separated word of the <b> text is kept,
# so a multi-word subject would be truncated - confirm this is acceptable.
subject = subject_data.text.split()[0]

In [156]:
# Print each scraped field on its own line as "<label> <value>".
for _label, _value in (
    ('stars:', stars),
    ('review:', review),
    ('professor:', professor),
    ('subject:', subject),
):
    print(_label, _value)

stars: 5.0
review: My favorite professor of all time - my second semester with Aaron (AKA Ronald). I could not recommend someone more for such a difficult course. He made Chemistry SO easy and understandable, and Piazza is the most amazing forum. Take 110B/112B regardless of your major because it is worth it! Best TAs and always responded to questions within minutes.
professor: Aaron Garner 
subject: Chemistry
