## Data Processing

In [8]:
# Imports
import pandas as pd

## Load and prep the data

In [9]:
# Load the raw emotions spreadsheet into a DataFrame
df = pd.read_excel(io='raw/emotions_dataset.xlsx')

# Preview the first five rows to sanity-check the columns that were read
print(df.head(n=5))

             Emotion                                        Description  \
0           Abbiocco  Abbiocco is an Italian term for the drowsy fee...   
1  Abhimaan (अभिमान)  A Marathi term for a specific kind of pride th...   
2            Abhiman  A complex Indian emotion combining hurt pride,...   
3             Ablaze  Ablaze describes the state of being consumed b...   
4             Abrazo  Abrazo represents the emotional warmth and sec...   

   Example sentences  Example situations Language  Comments  
0                NaN                 NaN  Italian       NaN  
1                NaN                 NaN  Marathi       NaN  
2                NaN                 NaN   Indian       NaN  
3                NaN                 NaN  English       NaN  
4                NaN                 NaN  Spanish       NaN  


In [11]:
# Clean dataset (with ALL content): keep just the three columns used downstream,
# copied so later column additions never touch the raw frame
df_clean = df.loc[:, ['Emotion', 'Description', 'Language']].copy()

# Clean dataset (with only LLM generated content) -- disabled alternative that
# keeps only the rows whose 'Checked' value is 'y'
# df_clean = df[['Emotion', 'Description', 'Checked', 'Language']].copy()
# df_clean = df_clean[df_clean['Checked'] == 'y']
# df_clean = df_clean.drop('Checked', axis=1)

# Add a "<Emotion>: <Description>" column; this combined text is what gets embedded
df_clean['Full_description'] = df_clean['Emotion'].add(": ").add(df_clean['Description'])

print(df_clean.head(n=5))

             Emotion                                        Description  \
0           Abbiocco  Abbiocco is an Italian term for the drowsy fee...   
1  Abhimaan (अभिमान)  A Marathi term for a specific kind of pride th...   
2            Abhiman  A complex Indian emotion combining hurt pride,...   
3             Ablaze  Ablaze describes the state of being consumed b...   
4             Abrazo  Abrazo represents the emotional warmth and sec...   

  Language                                   Full_description  
0  Italian  Abbiocco: Abbiocco is an Italian term for the ...  
1  Marathi  Abhimaan (अभिमान): A Marathi term for a specif...  
2   Indian  Abhiman: A complex Indian emotion combining hu...  
3  English  Ablaze: Ablaze describes the state of being co...  
4  Spanish  Abrazo: Abrazo represents the emotional warmth...  


## Create the embeddings

In [12]:
import os
from openai import OpenAI
from data_utils import get_embedding

# The API key is read from the environment (None when the variable is unset)
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# Create embeddings: one get_embedding call per combined description, in row order
embeddings = list(
    map(
        lambda text: get_embedding(text, client),
        df_clean['Full_description'].to_list(),
    )
)

# Keep each embedding next to its source row so the original emotion can be
# retrieved from a search hit; df_clean itself is left unchanged
df_results = df_clean.assign(Embedding=embeddings)

In [13]:
# Show the stored embedding column as a quick sanity check
print(df_results.loc[:, 'Embedding'])

0      [-0.05360228568315506, -0.006268118973821402, ...
1      [-0.04037681967020035, 0.004841753281652927, -...
2      [-0.03313937783241272, -0.01248866319656372, -...
3      [-0.007300569210201502, 0.01511992234736681, -...
4      [-0.031230805441737175, 0.00743915606290102, -...
                             ...                        
479    [-0.05270390212535858, -0.01653508096933365, -...
480    [-0.024971861392259598, -0.005448988173156977,...
481    [-0.002951705828309059, -0.027292585000395775,...
482    [-0.009862719103693962, -0.007757913786917925,...
483    [0.009859306737780571, 0.017048237845301628, -...
Name: Embedding, Length: 484, dtype: object


## Store the embeddings

In [14]:
# Export the results under one base name in two formats: CSV and pickle

filename = "embeddings_2025-06-25_3"

# CSV copy of the results table
df_results.to_csv("./processed/{}.csv".format(filename))
# Pickle copy of the same DataFrame
df_results.to_pickle(path='./processed/{}.pkl'.format(filename))