## Data Processing

In [None]:
# Imports
import pandas as pd

## Load and prep the data

In [None]:
# Load the raw dictionary workbook into a DataFrame
df = pd.read_excel(io='raw/full_dictionary.xlsx')

# Preview the first five rows as a quick sanity check of the load
print(df.head(n=5))

In [None]:
# Alternative (disabled): keep ALL content, ignoring the 'Checked' flag
# df_clean = df[['Emotion', 'Description']].copy()

# Keep only the rows whose 'Checked' flag is 'y' and retain the Emotion,
# Description and Language columns; 'Checked' itself is not carried forward.
# NOTE(review): the original note describes this subset as "only LLM generated
# content" -- confirm that this is what Checked == 'y' marks.
df_clean = df.loc[
    df['Checked'] == 'y',
    ['Emotion', 'Description', 'Language'],
].copy()

# New column holding "<Emotion>: <Description>" for each row
df_clean['Full_description'] = df_clean['Emotion'] + ": " + df_clean['Description']

print(df_clean.head())

## Create the embeddings

In [None]:
import os
from openai import OpenAI
from data_utils import get_embedding

# Take the API key from the environment and hand it to the OpenAI client
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# One get_embedding call per 'Full_description' value, in row order
embeddings = [
    get_embedding(description, client)
    for description in df_clean['Full_description'].to_list()
]

# Attach the embeddings to a copy of the cleaned rows, so the emotion behind
# each embedding can still be looked up after a search
df_results = df_clean.assign(Embedding=embeddings)

In [None]:
# Earlier workaround, left disabled for reference: it pulled the raw vector out
# of a response object stored in each row.
# NOTE(review): presumably no longer needed -- confirm what get_embedding returns.
# df_results['Embedding_raw'] = [df_results['Embedding'][i].data[0].embedding for i in range(0,len(df_results['Embedding']))]

# Show the stored 'Embedding' column to inspect what each row holds
print(df_results.loc[:, 'Embedding'])

## Store the embeddings

In [None]:
# Export the results to both CSV and pickle under ./processed/
from pathlib import Path

filename = "embeddings"

# Create the output directory up front: pandas does not create missing parent
# directories, and failing here would throw away the embeddings computed above.
output_dir = Path("./processed")
output_dir.mkdir(parents=True, exist_ok=True)

# The CSV is a plain-text copy; the pickle keeps the DataFrame's Python objects.
# NOTE(review): if the 'Embedding' cells hold lists, the CSV stores them as
# text -- reload from the pickle when the numeric values are needed.
df_results.to_csv(path_or_buf=output_dir / f"{filename}.csv")
df_results.to_pickle(output_dir / f"{filename}.pkl")