In [2]:
# import the necessary packages
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from groq import Groq
from webscraper import webscraper as ws
import requests
import os

# Read the .env file and report whether any variables were loaded from it.
env_loaded = load_dotenv()
print("Loaded environment variables" if env_loaded else "Failed to load environment variables")

  from tqdm.autonotebook import tqdm


Loaded environment variables


In [3]:
# Notebook-wide state.
# global_val is the first professor id the batch cell will scrape next.
global_val = 1
# System message sent to the chat model together with each scraped professor record.
system_prompt = """You are a rate my professor agent to help students find their professors for their classes.
                   Using a dictionary of information for a professor which will be provided, create a really 
                   detailed and generalized paragraph summary containing all of the information in the
                   dictionary."""

# API clients; the keys come from the process environment.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
groq = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [4]:
# Create the index only when it is missing. Checking explicitly (instead of
# swallowing every exception) lets real failures such as a bad API key or a
# network error surface instead of being reported as "Index already exists".
if "rmp-index" in pc.list_indexes().names():
    print("Index already exists")
else:
    pc.create_index(
        "rmp-index",
        dimension=1024,  # must equal the length of the vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("Index created")

Index already exists


In [11]:
def getEmbeddings(model_id: str, hf_token: str, data: list[str], timeout: float = 120.0):
    """Request embeddings for ``data`` from the Hugging Face feature-extraction endpoint.

    Args:
        model_id: Model identifier appended to the feature-extraction URL.
        hf_token: Token sent as a Bearer ``Authorization`` header.
        data: Text to embed, forwarded unchanged as the ``inputs`` field.
            NOTE(review): the batch cell in this notebook passes a single
            string rather than a list -- confirm the response shape expected
            for each form.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so an
            error payload is never mistaken for an embedding.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    payload = {"inputs": data, "options": {"wait_for_model": True}}

    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning the error body to the caller.
    response.raise_for_status()
    return response.json()

# Embedding model requested from the Hugging Face API, and the token used to call it.
model_id = "intfloat/multilingual-e5-large"
hf_token = os.getenv("HUGGINGFACE_API_KEY")

# Number of consecutive professor ids processed per run of this cell.
BATCH_SIZE = 50

# Scrape, summarise and embed one batch of professors starting at global_val.
processed_data = []
for i in range(global_val, global_val + BATCH_SIZE):
    print("--------------------------", "Iteration:", i, "--------------------------")
    rmp_url = f"https://www.ratemyprofessors.com/professor/{i}"
    data = ws.scrape_rmp_link(rmp_url)

    # A name of "N/A" is treated as "no profile exists for this id".
    if data['prof_name'] == "N/A":
        print("Professor not found", "\n")
        continue

    # Ask the chat model to turn the scraped dictionary into a prose summary.
    response = groq.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": str(data)
            }
        ],
        model="llama3-8b-8192",
    )

    summary = response.choices[0].message.content

    embeddings = getEmbeddings(model_id, hf_token, summary)

    # getEmbeddings returns whatever JSON the API sent back. Anything that is
    # not a list cannot be used as vector values and would make the whole
    # batch upsert fail, so skip this professor instead.
    if not isinstance(embeddings, list):
        print("Embedding request failed, skipping professor:", embeddings, "\n")
        continue

    processed_data.append(
        {
            "values": embeddings,
            # NOTE(review): professor names are not guaranteed to be unique;
            # two professors sharing a name would share one vector id -- confirm
            # whether a unique id (e.g. the numeric profile id) should be used.
            "id": data['prof_name'],
            "metadata": {
                "summary": summary,
                "department": data['prof_dept'],
                "university": data['university_name'],
                "rating": data['rating'],
                "top_tags": data['top_tags'],
                "difficulty": data['difficulty'],
                "classes_taught": data['classes_taught'],
            }
        }
    )

if processed_data:
    # Insert the embeddings into the Pinecone index
    index = pc.Index("rmp-index")
    upsert_response = index.upsert(
        vectors=processed_data,
        namespace="professors",
    )
    print(f"Upserted count: {upsert_response['upserted_count']}")

    # Print index statistics
    print(index.describe_index_stats())
else:
    # Do not call exit() here: in a notebook it shuts the kernel down and
    # discards global_val along with every other variable.
    print("No professors found in this batch; nothing to upsert")

# Advance the cursor only after the batch has been handled, so a failed upsert
# can be retried by re-running this cell instead of silently skipping the batch.
global_val += BATCH_SIZE



-------------------------- Iteration: 301 --------------------------

[Professor Info]

Overall rating - N/A
Number of ratings - N/A
Professor name - N/A
Professor department - N/A
University name - N/A
Level of difficulty - N/A
Would take again - N/A
Classes taught - ['N/A']
Top tags - ['N/A']
Recent comments - ['N/A']

Professor not found 

-------------------------- Iteration: 302 --------------------------

[Professor Info]

Overall rating - 1.9/5
Number of ratings - 37 ratings
Professor name - Joe Adamski 
Professor department - Computer Science department
University name - Grand Valley State University
Level of difficulty - 3.7/5
Would take again - N/A
Classes taught - [' SCI000', ' CS350', ' CS', ' CS450', ' ALL', ' CS353', ' ADVISOR', ' CS253']
Top tags - ['N/A']
Recent comments - ['.', "This man hated teaching and wasn't afraid to tell people.  He only cared about publishing.  Thank the Lord he's gone!", 'He was OK, not close to the best I ever had...', 'This man was awesome! 