## Get Vector Embeddings - Sub-File Level (Course Descriptions Version)

In [None]:

import os
import json
import openai
import openai
from openai import OpenAI
import re

def course_description_processing(
        input_file_path: str,
        output_file_path: str,
        delimiter: str,
        client,
        embedding_model_name,
        min_chunk_length: int = 20) -> None:
    """Embed each course description in a text file and save the results as JSON.

    The input file is read as UTF-8 (undecodable bytes are dropped), split on
    ``delimiter``, and every chunk is stripped and flattened onto one line.
    Chunks that are too short or whose derived name does not look like
    ``DEPT ####`` (optionally followed by ``L``) are skipped with a message.
    Each remaining chunk is sent to ``client.embeddings.create`` and stored as
    a ``{'name', 'content', 'embedding'}`` dict. The list of dicts is written
    to ``output_file_path`` as indented JSON.

    Args:
        input_file_path: Path of the text file holding the course descriptions.
        output_file_path: Path of the JSON file to write.
        delimiter: Separator between course descriptions in the input file.
        client: Object exposing ``embeddings.create(input=..., model=...)``
            whose result provides ``data[0].embedding``.
        embedding_model_name: Model identifier forwarded to the client.
        min_chunk_length: Chunks shorter than this many characters (after
            cleaning) are skipped. Defaults to 20.

    Returns:
        None. Progress and diagnostics are printed; results go to the JSON file.
    """
    # Local import keeps the function self-contained without touching the
    # file-level import block.
    from collections import Counter

    with open(input_file_path, 'r', encoding='utf8', errors='ignore') as f:
        file_contents = f.read()

    print('File opened successfully.')

    # Split content by delimiter
    chunks = file_contents.split(delimiter)

    print(f"Split content into {len(chunks)} chunks.")

    # Compiled once: four capital letters, a space, four digits, optional 'L'.
    name_pattern = re.compile(r'^[A-Z]{4} \d{4}L?$')

    course_data = []

    for i, raw_chunk in enumerate(chunks):

        if i % 10 == 0:
            print(f"Processing chunk {i}...")

        chunk_content = raw_chunk.strip().replace('\n', ' ')

        if len(chunk_content) < min_chunk_length:
            print(f"Skipping chunk {i} with length {len(chunk_content)} and content '{chunk_content}' due to inadequate length\n\n")
            continue

        # The course code occupies characters 3-12 of the chunk: the first two
        # characters are skipped (presumably a markup prefix -- confirm against
        # the input file) and ten are kept so that an 'L' suffix fits; for
        # codes without the suffix the tenth character is the trailing '.',
        # which is removed here.
        chunk_name = chunk_content[2:12].replace('.', '')

        # Identify invalid chunk names
        if not name_pattern.match(chunk_name):
            print(f"Skipping chunk {i} due to invalid name '{chunk_name}' with content '{chunk_content}'\n\n")
            continue

        # Get embedding from the client
        embedding = client.embeddings.create(
            input=chunk_content,
            model=embedding_model_name
        )

        # Store course data with name, content, and embedding
        course_entry = {
            'name': chunk_name,
            'content': chunk_content,
            'embedding': embedding.data[0].embedding
        }

        if i % 10 == 0:
            print(course_entry)

        course_data.append(course_entry)

    # Identify duplicate course names in a single pass over the entries.
    name_counts = Counter(entry['name'] for entry in course_data)
    duplicates = {name for name, count in name_counts.items() if count > 1}
    if duplicates:
        print(f"Found duplicate course names: {duplicates}")
    else:
        print("No duplicate course names found.")

    # Save to JSON
    with open(output_file_path, "w", encoding='utf8') as f:
        json.dump(course_data, f, indent=2)

    print(f"Saved {len(course_data)} course entries to {output_file_path}")


# Configuration comes from the environment: OPENAI_API_KEY for credentials and
# EMBEDDING_MODEL_NAME for the embedding model passed to the API.
openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model_name = os.getenv("EMBEDDING_MODEL_NAME")

# Fail fast with a clear message instead of passing None as the model name and
# only discovering the problem at the first embedding request.
if not embedding_model_name:
    raise RuntimeError("EMBEDDING_MODEL_NAME environment variable is not set.")

client = OpenAI()

input_file_path = 'rag_corpus/ug_cat/staged/courses/course_descriptions.md'
output_file_path = 'rag_corpus/ug_cat/staged/courses/course_descriptions_with_embeddings_new.json'

# Course descriptions in the input file are separated by blank lines.
course_description_processing(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    delimiter='\n\n',
    client=client,
    embedding_model_name=embedding_model_name)

In [None]:
# Load embeddings from the JSON file and print a short preview of each entry
with open(output_file_path, 'r') as f:
    course_descriptions_with_embeddings = json.load(f)

# course_description_processing saves a list of
# {'name', 'content', 'embedding'} dicts, so calling .items() on it fails.
# A mapping of name -> [description, embedding] (the shape the previous
# version of this loop assumed) is still accepted.
if isinstance(course_descriptions_with_embeddings, dict):
    previews = (
        (k, v[0], v[1])
        for k, v in course_descriptions_with_embeddings.items()
    )
else:
    previews = (
        (entry['name'], entry['content'], entry['embedding'])
        for entry in course_descriptions_with_embeddings
    )

for name, description, embedding in previews:
    print(f"Key: {name}, \nDescription: {description[:50]}..., \nEmbedding: {embedding[:5]}...\n")

In [None]:
import json
import os
# Split the course descriptions file into smaller files
def split_course_descriptions(input_file_path, output_dir, chunk_size=100):
    """Split a JSON list of course entries into numbered smaller JSON files.

    The list stored at ``input_file_path`` is cut into consecutive slices of
    at most ``chunk_size`` entries. Slice ``k`` (1-based) is written to
    ``<output_dir>/course_descriptions_chunk_<k>.json`` as indented JSON.
    ``output_dir`` is created if it does not exist.

    Args:
        input_file_path: Path of the JSON file containing a list of entries.
        output_dir: Directory that receives the chunk files.
        chunk_size: Maximum number of entries per chunk file; must be >= 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        TypeError: If the JSON document is not a list.
    """
    # A zero step makes range() fail and a negative one silently writes
    # nothing, so reject both up front with a clear message.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with open(input_file_path, 'r', encoding='utf8') as f:
        course_descriptions = json.load(f)

    if not isinstance(course_descriptions, list):
        raise TypeError(
            f"Expected a JSON list in {input_file_path}, "
            f"got {type(course_descriptions).__name__}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for chunk_number, start in enumerate(
            range(0, len(course_descriptions), chunk_size), start=1):
        chunk = course_descriptions[start:start + chunk_size]
        output_file_path = os.path.join(
            output_dir, f'course_descriptions_chunk_{chunk_number}.json')

        with open(output_file_path, 'w', encoding='utf8') as out_f:
            json.dump(chunk, out_f, indent=2)

        print(f"Saved chunk {chunk_number} with {len(chunk)} entries to {output_file_path}")

# Directory that receives the numbered chunk files
output_dir = 'rag_corpus/ug_cat/staged/courses/'

# JSON file holding every course entry together with its embedding.
# NOTE(review): the embedding cell earlier in this notebook writes to
# '..._with_embeddings_new.json'; this path has no '_new' suffix -- confirm
# which file is meant to be split.
output_file_path = 'rag_corpus/ug_cat/staged/courses/course_descriptions_with_embeddings.json'

# Write the entries out in files of up to 500 each (tune chunk_size as needed)
split_course_descriptions(output_file_path, output_dir, chunk_size=500)