## Data Processing

In [23]:
# Imports
import pandas as pd

## Load and prep the data

In [24]:
# Load the raw emotion dictionary from the Excel workbook
raw_path = 'Raw/full_dictionary.xlsx'
df = pd.read_excel(raw_path)

# Show the first few rows as a quick sanity check of the columns
preview = df.head()
print(preview)

     Emotion                                        Description  \
0   Abbiocco  Abbiocco is an Italian term for the drowsy fee...   
1    Abhiman  A complex Indian emotion combining hurt pride,...   
2  Abhiyukta  Abhiyukta is a Sanskrit term that means 'being...   
3     Abiura  Abiura is an Italian term that captures the ac...   
4     Ablaze  Ablaze describes the state of being consumed b...   

                       Source Checked Comments        Type  Language Link  \
0           Claude 3.5 Sonnet       y      NaN  Experience   Italian  NaN   
1  The Book of Human Emotions       y      NaN  Experience    Indian  NaN   
2                        Grok       n      NaN         NaN  Sanskrit  NaN   
3                        Grok       n      NaN         NaN   Italian  NaN   
4           Claude 3.5 Sonnet       y      NaN  Experience   English  NaN   

  Example sentences  Example situations  
0               NaN                 NaN  
1               NaN                 NaN  
2       

In [25]:
# Clean dataset: keep only the rows whose 'Checked' flag is 'y', retaining the
# emotion name, its description and its language.
# NOTE(review): this subset was previously labelled "only LLM generated content";
# the filter itself only looks at the 'Checked' column -- confirm the intent.
# (To keep ALL content regardless of the flag, use instead:
#  df_clean = df[['Emotion', 'Description']].copy())
is_checked = df['Checked'] == 'y'
df_clean = df.loc[is_checked, ['Emotion', 'Description', 'Language']].copy()

# Add a column joining the name and the description as "<Emotion>: <Description>"
df_clean = df_clean.assign(
    Full_description=lambda frame: frame['Emotion'] + ": " + frame['Description']
)

print(df_clean.head())

    Emotion                                        Description Language  \
0  Abbiocco  Abbiocco is an Italian term for the drowsy fee...  Italian   
1   Abhiman  A complex Indian emotion combining hurt pride,...   Indian   
4    Ablaze  Ablaze describes the state of being consumed b...  English   
5    Abrazo  Abrazo represents the emotional warmth and sec...  Spanish   
6    Acedia  Acedia is a Latin term describing a state of m...    Latin   

                                    Full_description  
0  Abbiocco: Abbiocco is an Italian term for the ...  
1  Abhiman: A complex Indian emotion combining hu...  
4  Ablaze: Ablaze describes the state of being co...  
5  Abrazo: Abrazo represents the emotional warmth...  
6  Acedia: Acedia is a Latin term describing a st...  


## Create the embeddings

In [26]:
import os

from openai import OpenAI
from data_utils import get_embedding

# SECURITY: the API key must never be hard-coded in the notebook. It is read
# from the OPENAI_API_KEY environment variable; if the variable is missing this
# raises KeyError right here instead of failing later on the first API call.
# NOTE(review): a literal key was previously committed in this cell -- treat it
# as compromised and revoke/rotate it.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Create one embedding per combined "Emotion: Description" string, in row order
descriptions = df_clean['Full_description'].to_list()
embeddings = [get_embedding(description, client) for description in descriptions]

# Store embeddings next to their source rows so the original emotion can be
# retrieved via search
df_results = df_clean.copy()
df_results['Embedding'] = embeddings

In [27]:
# Inspect the stored embeddings.
# NOTE(review): an earlier version of this cell unpacked the vector from a response
# object for each row; the printed output shows plain float lists, so that step
# appears to be unnecessary now -- confirm against get_embedding. Kept for reference:
# df_results['Embedding_raw'] = [df_results['Embedding'][i].data[0].embedding for i in range(0,len(df_results['Embedding']))]
embedding_column = df_results['Embedding']
print(embedding_column)

0      [-0.05363987758755684, -0.006282945163547993, ...
1      [-0.03313937783241272, -0.01248866319656372, -...
4      [-0.007300569210201502, 0.01511992234736681, -...
5      [-0.031230805441737175, 0.00743915606290102, -...
6      [-0.05120302364230156, 0.019050151109695435, -...
                             ...                        
442    [-0.0002702227793633938, -0.000597009318880736...
443    [-0.05270999297499657, -0.016541967168450356, ...
444    [-0.02500520832836628, -0.005418649408966303, ...
445    [-0.0029342544730752707, -0.027296055108308792...
449    [0.009859306737780571, 0.017048237845301628, -...
Name: Embedding, Length: 380, dtype: object


## Store the embeddings

In [28]:
# Export the results twice under the same base name: once as CSV, once as pickle

filename = "embeddings"
output_dir = "./Processed/"

csv_path = output_dir + filename + ".csv"
pickle_path = output_dir + filename + '.pkl'

df_results.to_csv(csv_path)
df_results.to_pickle(pickle_path)