In [1]:
import os
import pandas as pd
from ast import literal_eval

# Path to the semicolon-separated course CSV.
# NOTE: despite its name, SUBJECT_FOLDER points at a single file, not a folder.
SUBJECT_FOLDER = "data/course_data_stripped.csv"

# Load the file into a dataframe; `delimiter` is pandas' alias for `sep`.
course_df = pd.read_csv(SUBJECT_FOLDER, delimiter=";")

# Show the first two rows as the cell output.
course_df.head(n=2)

Unnamed: 0,emne_id,emne_navn,emne_navn_eng,studiepoeng,emne_utbytte,emne_innhold,faculty_code,contains_description
0,BE-002,Forkurs i bedriftsøkonomi for EMBA-studenter,Forkurs i bedriftsøkonomi for EMBA-studenter,0.0,"['Etter fullført emne skal studenten:', ['ha g...",['Emnet tilbys som et valgfritt forkurs til st...,HH,True
1,BE-111,Innføring i finansregnskap,Introduction to Financial Accounting,7.5,['Etter å ha fullført dette emnet skal student...,['Emnet gir en grunnleggende innføring i finan...,HH,True


In [None]:
# We'll do some initial preprocessing of the data.
# Since we're loading data from csv, we'll need to convert the string representation of lists back to lists.
def _parse_list_literal(value):
    """Turn the string form of a Python literal back into the literal itself.

    Values that are not strings are returned unchanged. This keeps the cell
    re-runnable: on a second execution the columns already hold parsed lists,
    and handing those to `literal_eval` would raise a ValueError.
    """
    return literal_eval(value) if isinstance(value, str) else value


for list_column in ("emne_innhold", "emne_utbytte"):
    course_df[list_column] = course_df[list_column].apply(_parse_list_literal)

# Some of the courses have neglected to provide a course description. We'll remove these courses from the dataset.
has_description = course_df["emne_innhold"].apply(lambda description: len(description) > 0)
# The helper column loaded from the csv is no longer needed once the filter is applied;
# errors="ignore" lets this cell run again after the column has already been dropped.
course_df = (
    course_df[has_description]
    .copy()
    .drop(columns=["contains_description"], errors="ignore")
)

# We'll need to convert the course content into strings to be able to embed them.
# The structure I decided to use here is somewhat arbitrary
def construct_full_course_description(row):
    """Render one course record as a single text block for embedding.

    Args:
        row: Any object supporting key lookup for 'emne_id', 'emne_navn_eng',
            'emne_utbytte' and 'emne_innhold' (e.g. a dataframe row).

    Returns:
        A string with the course code, English name, outcomes and content.
        The outcomes and content values are interpolated with their default
        string formatting.
    """
    # Each section is separated by five spaces, and the text also ends with
    # five spaces. This whitespace is reproduced deliberately so the text that
    # gets embedded stays exactly the same.
    separator = " " * 5
    sections = (
        f"Course Code: {row['emne_id']}\n",
        f"Course Name: {row['emne_navn_eng']}\n\n",
        f"Course Outcomes: \n {row['emne_utbytte']}\n\n",
        f"Course Content: \n {row['emne_innhold']}",
    )
    return separator.join(sections) + separator

# Build one description string per course (row-wise apply), then expose the
# results as a NumPy array so they can be sliced into batches.
course_descriptions = course_df.apply(construct_full_course_description, axis=1)
content_to_embed = course_descriptions.to_numpy()

In [None]:
import voyageai
from tqdm.notebook import tqdm

# Provide your Voyage AI API key through the VOYAGE_API_KEY environment variable so the
# secret never has to be written into (and accidentally committed with) the notebook.
# If the variable is unset or empty we fall back to the placeholder string, which you
# may still replace by hand, and fail early with a clear error if it was left untouched.
VOYAGE_AI_API_KEY = os.environ.get("VOYAGE_API_KEY") or "placeholder-string"
if VOYAGE_AI_API_KEY == "placeholder-string":
    raise ValueError("Please provide a Voyage AI API key")

voyageai.api_key = VOYAGE_AI_API_KEY
# Pass the key explicitly so the client does not depend on module-level state.
vo = voyageai.Client(api_key=VOYAGE_AI_API_KEY)

# Maximum batch size of 128 per request, we'll need to split the data into batches
BATCH_SIZE = 128
# Ceiling division. The previous `len // BATCH_SIZE + 1` produced one batch too many
# whenever the number of items was an exact multiple of BATCH_SIZE (or zero), which
# sent an empty list to the embedding endpoint.
number_of_batches = (len(content_to_embed) + BATCH_SIZE - 1) // BATCH_SIZE
embeddings = []
total_tokens = 0
for i in tqdm(range(number_of_batches)):
    start_index = i * BATCH_SIZE
    # Slicing clamps at the end of the array, so the final batch may simply be shorter.
    batch_content = content_to_embed[start_index:start_index + BATCH_SIZE]
    batch_embeddings = vo.embed(list(batch_content), model="voyage-3")
    embeddings.extend(batch_embeddings.embeddings)
    total_tokens += batch_embeddings.total_tokens

# Voyage AI provides a solid interface overview of how many tokens have been used on your account so far.
# At the time of writing they provide 2 million tokens for free.
print(f"Total tokens: {total_tokens} for a total of {len(embeddings)} embeddings")
# One embedding per remaining course, in the same order as the rows of course_df.
course_df["embedding"] = embeddings

  0%|          | 0/9 [00:00<?, ?it/s]

Total tokens: 578492 for a total of 1144 embeddings


In [None]:
# Persist the dataframe (including the new "embedding" column) as a pickle file
# so the embeddings do not have to be requested again.
DATAFRAME_PATH = "data/course_data_embedded.pkl"
course_df.to_pickle(path=DATAFRAME_PATH)