In [None]:
import openai
import json
import pandas as pd
import pathlib
import base64

In [None]:
# Root directory for every data file: the resolved current working directory.
path = pathlib.Path().resolve()

# Transcript CSV and the matching exported audio track for the same scene.
dialogues_file_path = path.joinpath("data", "csv", "0_The Gommage.csv")
audio_file_path = path.joinpath("data", "audio", "exported", "0_The Gommage.mp3")

In [None]:
# Load the OpenAI API key from a local token file (kept out of the notebook itself).
# The file is opened in a `with` block so the handle is closed deterministically,
# and the text is stripped so a trailing newline in the file does not become part
# of the key sent to the API.
with open(path/"data/open_ai_token.txt", "r") as token_file:
    key = token_file.read().strip()

# Client used for every API request made further down.
client = openai.OpenAI(api_key=key)

In [None]:
# Load the transcript and put its rows in playback order
# (chapter, then dialogue, then line), with a fresh 0..n-1 index.
df = (
    pd.read_csv(dialogues_file_path)
    .sort_values(by=["chapter_index", "dialogue_index", "line_index"])
    .reset_index(drop=True)
)
df.head()

In [None]:
# Per-line key of the form "<dialogue_index>_<line_index>".
dialogue_part = df["dialogue_index"].astype(str)
line_part = df["line_index"].astype(str)
df["id"] = dialogue_part + "_" + line_part

# Transcript row handed to the model: "<id> | <speaker>: <line>".
speaker_and_line = df["speaker"] + ": " + df["line"]
df["outc"] = df["id"] + " | " + speaker_and_line
df["outc"].head()

In [None]:
# Emotion labels the model is asked to score.
target_emotions = [
    "anger",
    "sadness",
    "fear",
    "happiness",
    "joy",
    "determined",
    "sarcasm",
]

# System prompt, assembled line by line. The empty first and last entries
# reproduce the leading and trailing newline of the prompt text.
system_message = "\n".join(
    [
        "",
        "You are an audio emotion recognition assistant. Evaluate the likelihood of the emotions in the audio dialogue.",
        "The estimate should be between 0 and 1, and the total should add up to 1.",
        "Only classify the following emotions: " + ", ".join(target_emotions) + ".",
        "",
        "You will have the transcript of the dialogue. Use the row index as key when returning the estimate for the voice line.",
        "",
        "When you reply, do not add any other text. Just reply with a JSON formatted string.",
        "",
    ]
)

In [None]:
# Read the audio track and base64-encode it so it can be embedded in the
# request payload as text. The `with` block closes the file handle as soon as
# the bytes are read instead of leaving it open until garbage collection.
with open(audio_file_path, "rb") as audio_file:
    audio_b64 = base64.b64encode(audio_file.read()).decode("utf-8")

# Whole transcript as one newline-separated string, one "outc" row per line.
dialogues_text = "\n".join(df["outc"].to_list())

In [None]:
# https://platform.openai.com/docs/api-reference/chat/create

# System turn carrying the classification instructions.
system_turn = {"role": "system", "content": system_message}

# User turn: the transcript text followed by the audio itself, embedded inline
# as base64-encoded mp3 data (no uploaded-file reference is used).
user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": dialogues_text},
        {
            "type": "input_audio",
            "input_audio": {"data": audio_b64, "format": "mp3"},
        },
    ],
}

# Single chat-completion request with a low sampling temperature.
response = client.chat.completions.create(
    model="gpt-audio",
    temperature=0.1,
    messages=[system_turn, user_turn],
)

In [None]:
# Text of the first completion choice; treat a missing body as empty text so
# the failure surfaces as a JSON decode error below.
reply_text = (response.choices[0].message.content or "").strip()

# Chat models often wrap JSON in a Markdown code fence (```json ... ```) even
# when told to reply with JSON only. Remove the opening fence line (including
# any language tag) and the closing fence before parsing.
if reply_text.startswith("```"):
    reply_text = reply_text.split("\n", 1)[1] if "\n" in reply_text else ""
    reply_text = reply_text.rsplit("```", 1)[0]

# Parsed reply; raises json.JSONDecodeError if the text is still not valid JSON.
out_content = json.loads(reply_text)
out_content