## Get Vector Embeddings - One Embedding File per Content File (CoEng departments and programs version)

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from time import sleep
import chardet
import openai
from openai import OpenAI
import json

# Pull settings from the .env file into the process environment.
load_dotenv()

# Embedding model to use, e.g. "text-embedding-ada-002".
# TODO: consider trying newer embedding models.
embedding_model_name = os.getenv("EMBEDDING_MODEL_NAME")
data_base_path = '../processing/ug_cat/fc_md_claude_cleaned/'

# Sniff the metadata CSV's encoding from its first 10 KB, then load the CSV
# with whatever encoding was detected.
with open('rag_corpus/ug_cat_metadata.csv', 'rb') as csv_bytes:
    sample = csv_bytes.read(10000)
result = chardet.detect(sample)
encoding = result['encoding']
print(f"Detected encoding: {encoding}")
df = pd.read_csv('rag_corpus/ug_cat_metadata.csv', encoding=encoding)

# Collect the distinct catalog years. Five are expected; any other count only
# produces a warning and the run continues.
catalog_years = df['catalog_year'].unique()
print(f'Found {len(catalog_years)} unique catalog years: {catalog_years}')
year_count_ok = len(catalog_years) == 5
if not year_count_ok:
    print(f'Warning: Expected 5 unique catalog years, found {len(catalog_years)}. Please check the metadata file.')


# Set the API key on the openai module and create the client used for requests.
openai.api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI()

# Embed every content file listed in the metadata, one catalog year at a time.
# Each embedding is stored in its own JSON file under
# <year>/embeddings/<model name>/, so a rerun only calls the API for files
# that do not have a saved embedding yet.
for year in catalog_years:

    df_year = df[df['catalog_year'] == year]
    print(f'Processing catalog year: {year}')

    year_base_path = data_base_path + year + '/'

    for _, row in df_year.iterrows():
        # Source markdown file and the JSON file its embedding is stored in
        content_file_path = os.path.join(year_base_path, row['file_name'] + '.md')
        embedding_file_path = os.path.join(year_base_path, 'embeddings', embedding_model_name, row['file_name'] + '.json')

        # If output file already exists, skip
        if os.path.exists(embedding_file_path):
            print(f"File {row['file_name']} already exists, skipping.")
            continue

        # Read the input file content
        with open(content_file_path, 'r', encoding='utf-8') as f:
            input_content = f.read()

        # Get embedding from OpenAI
        response = client.embeddings.create(
            input=input_content,
            model=embedding_model_name
        )

        # A single input was sent, so the embedding is the first data item
        embedding = response.data[0].embedding

        # Save embedding file. Write to a temporary file first and then move
        # it into place: if the run is interrupted mid-write, no truncated
        # JSON is left at the final path for the skip check above to mistake
        # for a finished embedding.
        os.makedirs(os.path.dirname(embedding_file_path), exist_ok=True)
        temp_file_path = embedding_file_path + '.tmp'
        with open(temp_file_path, 'w', encoding='utf-8') as f:
            json.dump({
                'embedding_model': embedding_model_name,
                'embedding': embedding
            }, f)
        os.replace(temp_file_path, embedding_file_path)

        print(f"Processed and saved embedding for {row['file_name']}")

        # Pause between requests (presumably to stay under API rate limits)
        sleep(3)


Detected encoding: ISO-8859-1
Found 5 unique catalog years: ['2021-2022' '2022-2023' '2023-2024' '2024-2025' '2025-2026']
Processing catalog year: 2021-2022
File BSCE Civil Engineering already exists, skipping.
File BSCE Civil Engineering, Energy Infrastructure Concentration already exists, skipping.
File BSCE Civil Engineering, Land Development Engineering Concentration already exists, skipping.
File BSCPE Computer Engineering already exists, skipping.
File BSCPE Computer Engineering, Machine Learning Concentration already exists, skipping.
File BS Dual Degree Physics and Computer Engineering already exists, skipping.
File BSEE Electrical Engineering already exists, skipping.
File BSEE Electrical Engineering, Machine Learning Concentration already exists, skipping.
File BSEE Electrical Engineering, Power and Energy Systems Concentration already exists, skipping.
File BS Dual Degree Physics and Electrical Engineering already exists, skipping.
File BSME Mechanical Engineering already ex

## Get Vector Embeddings - One Embedding File for Many Content Files (unused)

In [None]:
import os
import json
import openai
import openai
from openai import OpenAI

def get_embeddings(
        directory: str, 
        output_file_name: str,
        client,
        embedding_model_name):
    """Embed every markdown file in a directory and save them in one JSON file.

    Each ``.md`` file directly inside ``directory`` is read as UTF-8 and sent
    to the embeddings endpoint of ``client``. The results are written to
    ``directory/output_file_name`` as a JSON object mapping each markdown file
    name to its embedding vector.

    Args:
        directory: Folder containing the markdown files; also where the
            output file is written.
        output_file_name: Name of the JSON file to create inside ``directory``.
        client: Object exposing ``embeddings.create(input=..., model=...)``,
            such as an ``OpenAI`` client instance.
        embedding_model_name: Model name passed to the embeddings endpoint.
    """
    results = {}

    # Get all markdown files (sorted so the processing order is deterministic)
    md_files = sorted(f for f in os.listdir(directory) if f.endswith(".md"))

    for file_name in md_files:
        # Read file content
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            content = f.read()

        # Request the embedding; one input is sent, so take the first data item
        response = client.embeddings.create(
            input=content,
            model=embedding_model_name
        )
        results[file_name] = response.data[0].embedding

    # Save to JSON
    with open(os.path.join(directory, output_file_name), "w", encoding="utf-8") as f:
        json.dump(results, f)

    print(f"Saved {len(results)} embeddings to {output_file_name}")


# Load env file
from dotenv import load_dotenv

# Read settings from the .env file before looking up the API key and model name.
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model_name = os.getenv("EMBEDDING_MODEL_NAME")
client = OpenAI()

# Root folder holding one sub-folder per catalog year.
catalogs_path = "rag_corpus/ug_cat/"
catalog_years = ["21_22", "22_23", "23_24", "24_25", "25_26"]

# Run get_embeddings on each catalog year's markdown folder that exists; the
# output file in each folder is named after the embedding model.
for catalog_year in catalog_years:
    catalog_path = os.path.join(catalogs_path, catalog_year, 'md_claude_from_fc')
    if os.path.exists(catalog_path):
        get_embeddings(
            catalog_path,
            output_file_name=f"{embedding_model_name}.json",
            client=client,
            embedding_model_name=embedding_model_name,
        )
    else:
        print(f"Catalog path {catalog_path} does not exist.")