In [1]:
# Name of the dataset entry to use; valid options are listed in dataset_config.py.
dataset_name = "EmoEvent (Raw)"
# Subset selector; None selects nothing specific.
# NOTE(review): not referenced by the cells shown here - confirm whether it is still needed.
subset = None

# When True, records carrying the catch-all label ("others" or similar, i.e. records
# that did not fit a specific class) are kept; when False they are removed after loading.
include_unlabeled_labels = False

# Name of the model entry to fine-tune; valid options are listed in LLM_config.py.
llm = "ChatGPT 4o-Mini"

In [2]:
from config_files import dataset_config, LLM_config

# Resolve the names picked in the configuration cell to their metadata entries.
# An unknown name raises KeyError here, before any data is read.
dataset_metadata = dataset_config.dataset[dataset_name]
llm_metadata = LLM_config.model[llm]

In [3]:
import pandas as pd

# Read the tab-separated file, then drop the unused columns and rename the remaining
# ones as described by the dataset's metadata entry - all in a single chain.
dataset = (
    pd.read_csv(dataset_metadata['abspath'], delimiter="\t")
    .drop(columns=dataset_metadata["unused_columns"])
    .rename(columns=dataset_metadata["remap_columns"])
)

# Unless catch-all labels were requested, discard every row whose label is the
# dataset's designated "unlabeled" value.
if not include_unlabeled_labels:
    is_unlabeled = dataset['labels'] == dataset_metadata["unlabeled_label"]
    dataset = dataset.drop(index=dataset.index[is_unlabeled])

dataset

Unnamed: 0,text,labels
0,Something to keep in mind... When situations l...,disgust
1,I'm really sorry about a whole history of 850 ...,sadness
2,The kid in me is sobbing at all the Hunchback ...,sadness
4,It’s World Book Day and a great day to visit t...,joy
5,Celebrate HASHTAG with a pop quiz. How many ch...,joy
...,...,...
7295,Happy HASHTAG. Bars with books are the best. ...,joy
7297,Is there a more iconic song to a football fan ...,joy
7298,In support of HASHTAG and all the kids (and ad...,sadness
7301,Your 2018/19 HASHTAG champions....⚽⚽⚽. Booooo...,joy


In [4]:
import json
import os

# The model id doubles as the output directory name; ":" is replaced because it is
# not usable in file names on every platform.
llm_directory = llm_metadata['id'].replace(":", "_")

# Plain-text system prompt; the placeholders are filled from the dataset metadata.
system_prompt = (
    "You are a highly accurate annotator, labeling records from a dataset of <text_type>s "
    "based on their <label_type>. Available labels are [<label_list>]. "
    "ONLY label using the single most-representative <label_type>."
)
system_prompt = system_prompt.replace('<text_type>', dataset_metadata['text_source'])
system_prompt = system_prompt.replace('<label_type>', dataset_metadata['label_type'])
system_prompt = system_prompt.replace('<label_list>', ", ".join(dataset_metadata['label_list']))

# JSON-encoded (quoted and escaped) form of the prompt, kept under its original name.
system_content = json.dumps(system_prompt, ensure_ascii=False)
print(system_content)


def build_training_example(text, label):
    """Return one JSONL line holding a single chat-format training example.

    Args:
        text: The record's text, used as the user message.
        label: The record's label, used as the assistant message.

    Returns:
        A newline-terminated JSON object with a system, a user and an assistant message.

    The line is serialized with ``json.dumps`` so quotes, backslashes, newlines and
    other control characters in ``text`` are escaped correctly and every record stays
    on exactly one line. ``ensure_ascii=False`` keeps non-ASCII characters (emoji,
    typographic quotes) readable in the UTF-8 output file.
    """
    record = {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
            {"role": "assistant", "content": label},
        ]
    }
    return json.dumps(record, ensure_ascii=False) + "\n"


print("\nExample: ")
# Positional access: the row labelled 0 may have been removed by earlier filtering.
if not dataset.empty:
    first_record = dataset.iloc[0]
    print(build_training_example(first_record['text'], first_record['labels']))

# Create the output directory up front instead of reacting to a failed open().
output_directory = os.path.join('.', llm_directory)
os.makedirs(output_directory, exist_ok=True)
output_path = os.path.join(output_directory, f"{dataset_metadata['id']}.jsonl")

# The context manager closes the file even if serializing a record fails.
with open(output_path, 'w', encoding='utf-8') as file:
    for text, labels in zip(dataset['text'], dataset['labels']):
        file.write(build_training_example(text, labels))

"You are a highly accurate annotator, labeling records from a dataset of tweets based on their emotion. Available labels are [anger, disgust, fear, joy, sadness, surprise]. ONLY label using the single most-representative emotion."

Example: 
{"messages": [{"role": "system", "content": "You are a highly accurate annotator, labeling records from a dataset of tweets based on their emotion. Available labels are [anger, disgust, fear, joy, sadness, surprise]. ONLY label using the single most-representative emotion."}, {"role": "user", "content": "Something to keep in mind... When situations like HASHTAG occursome lowlifes set up fake donation accounts and pocket the money so IF you are going to donate... Do some PROPER research first!  URL"}, {"role": "assistant", "content": "disgust"}]}



In [12]:
from openai import OpenAI
client = OpenAI()

# The training file is stored under the model id with ":" replaced by "_", so the same
# substitution is applied here; using the raw id would point at a different directory
# for any id that contains ":".
training_file_directory = f"./{llm_metadata['id'].replace(':', '_')}/{dataset_metadata['id']}.jsonl"

# Upload the JSONL file for fine-tuning; the context manager closes the handle
# once the upload call has returned.
with open(training_file_directory, "rb") as training_file:
    response = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )
# Id of the uploaded file, needed to start the fine-tuning job.
file_id = response.id

print(response)

FileObject(id='file-FbqofjQh0S4SeO0xoS1jfexr', bytes=1977655, created_at=1731781093, filename='emoevent_en_tokenized.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [13]:
# Start a fine-tuning job for the configured model using the uploaded training file.
response = client.fine_tuning.jobs.create(model=llm_metadata['id'], training_file=file_id)

# Keep the job id so the job can be looked up again later.
job_id = response.id

print(response)

FineTuningJob(id='ftjob-FlQa9hzZtBhhtqrqm9OqAoDa', created_at=1731781096, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-7nchscNZ9gVB9ZkzvlTsGpWX', result_files=[], seed=79215339, status='validating_files', trained_tokens=None, training_file='file-FbqofjQh0S4SeO0xoS1jfexr', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [15]:
# Fetch the current state of the job created above. The id comes from `job_id`
# rather than a pasted literal so a re-run never reports on a stale job.
client.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-FlQa9hzZtBhhtqrqm9OqAoDa', created_at=1731781096, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=7, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-7nchscNZ9gVB9ZkzvlTsGpWX', result_files=[], seed=79215339, status='running', trained_tokens=None, training_file='file-FbqofjQh0S4SeO0xoS1jfexr', validation_file=None, estimated_finish=1731783443, integrations=[], user_provided_suffix=None)