In [None]:
#————————————————————

# Name: Data Pre-Processing

# Purpose: Prepare the JSONL validation and training datasets for Fine-Tuning using pandas. This is the non-ai-generative approach to preparing these files.

# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch), Alex Dean (adean@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 19.05.2024
# Python Version: 3.10.4


# If necessary, download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\sdsc\requirements.txt

#————————————————————

In [1]:
# Import required libraries
import os
import io
import time
from io import StringIO
import json
import numpy as np
from pathlib import Path
import pandas as pd
import requests
import random

In [6]:
# Load the raw recipes table from the CSV export.

path_input = r"C:\Python\data\recipes.csv"  # Change path if required
df = pd.read_csv(
    path_input,
    sep=',',
    on_bad_lines='skip',  # malformed rows are dropped instead of raising
    low_memory=False,
)

In [7]:
# Remove columns that are not needed for fine-tuning.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
df = df.drop(columns=['id', 'contributor_id', 'submitted'], errors='ignore')

# Create subset of data with only vegan recipes.
# na=False treats rows without tags as non-matching (a mask containing NaN
# would raise on indexing); regex=False makes this a plain substring match.
# .copy() gives an independent frame so the column assignment below does not
# trigger chained-assignment warnings.
df = df[df['tags'].str.contains("vegan", na=False, regex=False)].copy()

# Collapse any run of whitespace in the recipe name into a single space
df["name"] = df["name"].str.replace(r'\s+', ' ', regex=True)

# Fixed seed so the sample and the train/validation split are reproducible
RANDOM_SEED = 42

# Create a 1% subset of the vegan recipes
df_sample = df.sample(frac=0.01, random_state=RANDOM_SEED)

# 70% of the sample becomes the training set
df_recipe_training_set = df_sample.sample(frac=0.7, random_state=RANDOM_SEED)

# The remaining 30% of the sample becomes the validation set
df_recipe_validation_set = df_sample.drop(df_recipe_training_set.index)


In [10]:
# Containers for the chat-formatted examples (filled by later cells).
training_data: list = []
validation_data: list = []

# Fixed prompts that are identical in every example.
system_message = (
    "This is a recipe generator. "
    "The recipe generated should be output as a JSON object."
)
user_message_fix = (
    "Create a flavourful recipe from a list of ingredients "
    "and provide the output as a JSON object"
)

def create_user_message(row):
    """Return the recipe's ingredients as a comma-separated string.

    Args:
        row: A DataFrame slice (only its first row is used) or a single-row
            Series, with an ``ingredients`` entry holding the textual form of
            a Python list, e.g. ``"['garlic', 'olive oil']"``.

    Returns:
        The ingredient names joined by ``", "``.

    Raises:
        IndexError: If ``row`` is an empty DataFrame slice.
    """
    import ast

    ingredients = row["ingredients"]
    # Callers pass a DataFrame slice; only the first entry is needed, so the
    # remaining rows are not processed.
    if isinstance(ingredients, pd.Series):
        ingredients = ingredients.values[0]

    text = str(ingredients)
    try:
        # Parsing the list literal keeps apostrophes inside names
        # ("baker's yeast") and drops quoting characters reliably.
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(item) for item in parsed)

    # Not a list literal: fall back to plain bracket/quote stripping.
    return text.strip("[]").replace("'", "")

def create_assistant_message(row):
    """Serialise one recipe as a single-line JSON object string.

    Args:
        row: A DataFrame slice (only its first row is serialised, matching
            the single recipe described by the user message) or a single-row
            Series.

    Returns:
        The JSON text of the recipe record, without a trailing newline.
    """
    if isinstance(row, pd.Series):
        # Promote a plain row to a one-row frame so it serialises as a record.
        row = pd.DataFrame([row])

    # Restrict to the first row so that a multi-row slice cannot produce
    # several records for what is a single training example.
    json_record = row.iloc[:1].to_json(orient='records', lines=True)

    # Line-delimited output may end with a newline; it is not part of the record.
    return json_record.rstrip("\n")

def create_final_message(row):
    """Assemble one chat-format example for the given recipe slice.

    The conversation consists of the system prompt, the fixed user
    instruction, a user message built from the recipe's ingredients, and an
    assistant message holding the recipe serialised as JSON.

    Args:
        row: The recipe slice passed on to ``create_user_message`` and
            ``create_assistant_message``.

    Returns:
        A dict with a single ``"messages"`` key holding the four messages.
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message_fix},
        {"role": "user", "content": create_user_message(row)},
        {"role": "assistant", "content": create_assistant_message(row)},
    ]
    return {"messages": conversation}

# Preview one example. Each call builds a single example, so pass a one-row
# slice rather than several rows.
create_final_message(df.iloc[:1])

{'messages': [{'role': 'system',
   'content': 'This is a recipe generator. The recipe generated should be output as a JSON object.'},
  {'role': 'user',
   'content': 'Create a flavourful recipe from a list of ingredients and provide the output as a JSON object'},
  {'role': 'user',
   'content': 'fennel seeds, green olives, ripe olives, garlic, peppercorn, orange rind, orange juice, red chile, extra virgin olive oil'},
  {'role': 'assistant',
   'content': '{"name":"aww marinated olives","minutes":15,"tags":"[\'15-minutes-or-less\', \'time-to-make\', \'course\', \'main-ingredient\', \'cuisine\', \'preparation\', \'occasion\', \'north-american\', \'appetizers\', \'fruit\', \'canadian\', \'dinner-party\', \'vegan\', \'vegetarian\', \'freezer\', \'dietary\', \'equipment\', \'number-of-servings\']","nutrition":"[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]","n_steps":4,"steps":"[\'toast the fennel seeds and lightly crush them\', \'place all the ingredients in a bowl , stir well\', \'cover and

In [11]:
# Build one chat example per training recipe.
# The list is rebuilt from scratch so re-running this cell cannot append
# duplicate examples. Each recipe is passed as a one-row DataFrame slice.
training_data = [
    create_final_message(df_recipe_training_set.iloc[pos:pos + 1])
    for pos in range(len(df_recipe_training_set))
]

In [12]:
# Build one chat example per validation recipe.
# The list is rebuilt from scratch so re-running this cell cannot append
# duplicate examples. Each recipe is passed as a one-row DataFrame slice.
validation_data = [
    create_final_message(df_recipe_validation_set.iloc[pos:pos + 1])
    for pos in range(len(df_recipe_validation_set))
]

In [15]:
# Spot-check one generated training example (position 50); nothing is
# printed when the training set has fewer than 51 examples.
preview_start = 50
preview_stop = preview_start + 1
for example in training_data[preview_start:preview_stop]:
    print(example)

{'messages': [{'role': 'system', 'content': 'This is a recipe generator. The recipe generated should be output as a JSON object.'}, {'role': 'user', 'content': 'Create a flavourful recipe from a list of ingredients and provide the output as a JSON object'}, {'role': 'user', 'content': 'cilantro leaves, blanched slivered almond, garlic cloves, salt, olive oil'}, {'role': 'assistant', 'content': '{"name":"vegan cilantro pesto","minutes":15,"tags":"[\'15-minutes-or-less\', \'time-to-make\', \'course\', \'preparation\', \'for-large-groups\', \'5-ingredients-or-less\', \'condiments-etc\', \'easy\', \'vegan\', \'vegetarian\', \'dietary\', \'number-of-servings\', \'3-steps-or-less\']","nutrition":"[174.0, 27.0, 1.0, 12.0, 4.0, 11.0, 0.0]","n_steps":3,"steps":"[\'combine cilantro , almonds , garlic and salt in food processor and process intil smooth\', \'with machine still running , add olive oil through feed tube and process into a smooth paste\', \'if not freezing , store tightly covered in 

In [16]:

def write_jsonl(data_list: list, filename: str) -> None:
    """Write each item of ``data_list`` as one JSON document per line (JSONL).

    Args:
        data_list: JSON-serialisable objects, written in the given order.
        filename: Destination file path. An existing file is overwritten and
            missing parent directories are created.

    Raises:
        TypeError: If an item cannot be serialised by ``json.dumps``.
    """
    target = Path(filename)
    # Create the destination folder up front instead of failing on open().
    target.parent.mkdir(parents=True, exist_ok=True)

    # Encoding is stated explicitly so the file does not depend on the
    # platform's locale default.
    with open(target, "w", encoding="utf-8") as out:
        for ddict in data_list:
            out.write(json.dumps(ddict) + "\n")

In [17]:
# Destination folder for the generated JSONL files (change path if required).
# This is a regular (non-raw) string: each "\\" is a single backslash, and the
# trailing separator lets the file names be appended directly.
output_path = "C:\\Python\\azure-openai-lab\\data\\generated_output\\"

# Make sure the destination folder exists before writing.
os.makedirs(output_path, exist_ok=True)

write_jsonl(training_data, output_path + "recipes-training-set.jsonl")
write_jsonl(validation_data, output_path + "recipes-validation-set.jsonl")