In [38]:
from pinecone import Pinecone, ServerlessSpec
import os 
import json
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a .env file into the process environment; the Pinecone
# key is read later through os.getenv("PINECONE_API_KEY").
load_dotenv()


True

In [40]:
# Initialize the OpenAI client. No key is passed explicitly, so the SDK is
# expected to pick it up from the environment populated by load_dotenv().
client = OpenAI()

# Initialize the Pinecone client with the key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so this cell can be
# re-run (e.g. after a kernel restart) without failing on an existing index.
# The dimension has to equal the length of the vectors upserted later on.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [107]:

def _node_text(parent, tag, css_class):
    """Return the stripped text of the first matching descendant, or None if absent."""
    node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else None


def scrapeReviews(profLink):
    """Download one professor page and extract its summary and student comments.

    Args:
        profLink: URL of the professor page to fetch.

    Returns:
        A one-element list containing a dict with the keys "professor",
        "subject-department", "rating", "difficulty" and "student_reviews".
        "subject-department", "rating" and "difficulty" are None when the
        corresponding element is not present on the page; "student_reviews"
        is a (possibly empty) list of comment strings.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
        ValueError: if the professor's name cannot be found on the page.
    """
    # A timeout keeps the notebook from hanging forever on a stalled
    # connection; raise_for_status avoids parsing an error page as a profile.
    response = requests.get(profLink, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    professor_fname = _node_text(soup, 'div', 'NameTitle__Name-dowf0z-0 cfjPUG')
    professor_lname = _node_text(soup, 'span', 'NameTitle__LastNameWrapper-dowf0z-2 glXOHH')
    if professor_fname is None or professor_lname is None:
        raise ValueError(f"Could not find the professor name on {profLink}")
    prof_name = professor_fname + " " + professor_lname

    rating = _node_text(soup, 'div', 'RatingValue__Numerator-qw8sqy-2 liyUjw')

    # Several feedback items can match; the value of the last one that carries
    # a number is kept (same outcome as the original loop when all are present).
    # NOTE(review): presumably the last feedback item is the difficulty -- confirm.
    difficulty = None
    for item in soup.find_all('div', class_='FeedbackItem__StyledFeedbackItem-uof32n-0 dTFbKx'):
        number = _node_text(item, 'div', 'FeedbackItem__FeedbackNumber-uof32n-1 kkESWs')
        if number is not None:
            difficulty = number

    # Same "last match wins" rule for the department link inside the title block.
    subject = None
    for title in soup.find_all('div', class_='NameTitle__Title-dowf0z-1 iLYGwn'):
        department = _node_text(title, 'a', 'TeacherDepartment__StyledDepartmentLink-fl79e8-0 iMmVHb')
        if department is not None:
            subject = department

    # Collect every comment; rating blocks without a comment element are skipped
    # instead of aborting the whole scrape.
    studentReviews = []
    for rating_body in soup.find_all('div', class_='Rating__RatingBody-sc-1rhvpxz-0 dGrvXb'):
        review_text = _node_text(rating_body, 'div', 'Comments__StyledComments-dzzyvm-0 gRjWel')
        if review_text is not None:
            studentReviews.append(review_text)

    formatted_reviews = {
        "professor": prof_name,
        "subject-department": subject,
        "rating": rating,
        "difficulty": difficulty,
        "student_reviews": studentReviews
    }

    # Wrapped in a list so the result can be passed to list.extend by callers.
    return [formatted_reviews]


In [117]:
# Scrape a single professor page and display the resulting one-element list.
x = scrapeReviews("https://www.ratemyprofessors.com/professor/2692312")
x

[{'professor': 'Christina Chung',
  'subject-department': 'Gender, Women, & Sexuality Studies department',
  'rating': '5',
  'difficulty': '2.5',
  'student_reviews': ["Christina is absolutely amazing, and one of the best teachers I've had at UW! She is knowledgable, funny, kind, and made our classes a safe and positive place. There is not too much work other than a final group project, but there are a fair amount of readings. 10/10 recommend!",
   'Professor Chung is amazing! I loved having her for GWSS 200 so I had to take her other course. There are weekly discussions and lectures. In-Class days are mandatory for discussion. There is 1 group project. She breaks down terms and really wants everyone to dive deep into the work.']}]

In [108]:

def addReviews(reviews, json_file='reviews.json'):
    """Append review records to the "reviews" list stored in a JSON file.

    Args:
        reviews: Iterable of JSON-serializable records to append.
        json_file: Path of the JSON file to update. A missing, empty or
            unparsable file is treated as an empty collection.

    Raises:
        TypeError: if the file holds valid JSON that is not an object, or if
            a record cannot be serialized to JSON. In the latter case the file
            on disk is left untouched.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # No file yet, or the file is empty/corrupted: start a fresh collection.
        data = {'reviews': []}

    if not isinstance(data, dict):
        raise TypeError(
            f"{json_file} must contain a JSON object, found {type(data).__name__}"
        )

    # A valid object without a "reviews" key gets one; other keys are preserved.
    data.setdefault('reviews', []).extend(reviews)

    # Serialize before opening the file for writing: opening with 'w' truncates
    # it, so a serialization error must not be able to wipe the existing data.
    serialized = json.dumps(data, indent=4)
    with open(json_file, 'w', encoding='utf-8') as file:
        file.write(serialized)

In [109]:
# Scrape one professor page and append the result to the default reviews.json.
x = scrapeReviews('https://www.ratemyprofessors.com/professor/936759')
addReviews(x)

In [110]:
# Load the accumulated reviews; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("reviews.json") as reviews_file:
    data = json.load(reviews_file)
data['reviews']

[{'professor': 'Fernando Fernandez',
  'subject-department': 'Psychology department',
  'rating': '4.5',
  'difficulty': '1.5',
  'student_reviews': ['He really enjoys what he teaches.  Awesome teacher.  Lets you use 8 1/2 X 11 sheet of notes for exams.  Learned alot in his class.',
   'Super easy class! Awesome professor. I loved his class.',
   'great professor!  Definitely take him for a psychology class.  He is easy to understand.  Make sure to read the book as you have to take a midterm and a cumulative final.  Pay attention to the chapter objectives that he gives you and remember to participate by sending in videos and commenting on blackboard when something is posted.',
   'Best professor! Cares about his students! Class was fun and interesting. He really tries to connect with students, and assigns were so much fun! Lets you use a cheat sheet for exams. He is awesome!',
   "Such a great professor. Really wants you to succeed. Extremely helpful and willing to go the extra mile. T

In [111]:
# Build one Pinecone vector record per professor entry in data['reviews'].
processed_data = []
client = OpenAI()

for review in data['reviews']:
    comments = review['student_reviews']
    if not comments:
        # Nothing to embed for this professor; skip rather than send an
        # empty input to the embeddings endpoint.
        continue

    # Embed all comments as ONE text. Passing the list itself yields one
    # embedding per comment, and taking data[0] would then keep only the
    # first comment's vector while the metadata claims to cover all of them.
    response = client.embeddings.create(
        input="\n".join(comments),
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],  # the professor name doubles as the vector id
        "metadata": {
            "subject-department": review["subject-department"],
            "rating": review["rating"],
            "difficulty": review["difficulty"],
            "student_reviews": review["student_reviews"]
        }
    })

In [112]:
# Get a handle to the 'rag' index and upsert the prepared records into
# namespace "ns1"; the call's return value is shown as the cell output.
index = pc.Index('rag')
index.upsert(
    vectors=processed_data, 
    namespace="ns1"
)

{'upserted_count': 1}

In [113]:
# Display the index statistics (dimension and per-namespace vector counts).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 16}},
 'total_vector_count': 16}