## Create four Dataset for labeling the recipes
####  14. November 2020
### Input
Chefkoch Instructions data csv-file
### Output
8 JSONs (2 per person) <br>
batch1: 10 overlap, 50 unique <br>
batch2: 20, overlap, 70 unique

In [1]:
# Coco Path
# path = r"C:\Users\CocoL\Universität St.Gallen\STUD-Capstoneproject Tell 6 - General\"
# Jona Path
# path = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/"
# Giovanni Path
path = r"/Users/jonathanebner/Universität St.Gallen/STUD-Capstoneproject Tell 6 - General"
# Leo Path
#ath = r"/Users/Leonidas/Universität St.Gallen/STUD-Capstoneproject Tell 6 - General/"

# Import packages
import pandas as pd
import numpy as np
import spacy

In [2]:
# Load data and seperate instructions at @ as defined
data = pd.read_csv(path+"02-Coding/01-Data/instructions.csv", sep="@", encoding="utf-16", header=None, error_bad_lines=False)
print("Number of imported recipes: ",len(data))

Number of imported recipes:  12190


In [3]:
# Name column text to have the keys named text later
data.columns = ["text"]

In [4]:
# Initiate nlp model
disabled = ['parser', 'tagger', 'ner']
nlp = spacy.load('de_core_news_lg', disable=disabled)

# Create a column with tokenized instructions using spacy
data["Tokenized"] = data["text"].apply(nlp) 
# Create a column with the amount of tokens per recipe
data["Length"] = data["Tokenized"].apply(len)
data.head()

Unnamed: 0,text,Tokenized,Length
0,Die Eier hart kochen. Dann pellen und mit eine...,"(Die, Eier, hart, kochen, ., Dann, pellen, und...",341
1,Vorab folgende Bemerkung: Alle Mengen sind Cir...,"(Vorab, folgende, Bemerkung, :, Alle, Mengen, ...",155
2,"Die Kirschen abtropfen lassen, dabei den Saft ...","(Die, Kirschen, abtropfen, lassen, ,, dabei, d...",133
3,"Den Spargel säubern, die holzigen Enden abschn...","(Den, Spargel, säubern, ,, die, holzigen, Ende...",137
4,Kohlrabi schälen und klein würfeln. Mit der Br...,"(Kohlrabi, schälen, und, klein, würfeln, ., Mi...",89


In [5]:
# select all recipes with a token length of smaller 210
data_smaller210 = data[data["Length"] < 210]
data_smaller210.head()

Unnamed: 0,text,Tokenized,Length
1,Vorab folgende Bemerkung: Alle Mengen sind Cir...,"(Vorab, folgende, Bemerkung, :, Alle, Mengen, ...",155
2,"Die Kirschen abtropfen lassen, dabei den Saft ...","(Die, Kirschen, abtropfen, lassen, ,, dabei, d...",133
3,"Den Spargel säubern, die holzigen Enden abschn...","(Den, Spargel, säubern, ,, die, holzigen, Ende...",137
4,Kohlrabi schälen und klein würfeln. Mit der Br...,"(Kohlrabi, schälen, und, klein, würfeln, ., Mi...",89
5,"Aus dem Mehl, der Butter, dem Ei und etwas Sal...","(Aus, dem, Mehl, ,, der, Butter, ,, dem, Ei, u...",152


In [7]:
# delete duplicates in text
data_smaller210_nodupl = data_smaller210.drop_duplicates(subset='text', keep="first")

In [34]:
print("All:", len(data))
print("Length 210:", len(data_smaller210))
print("No duplicates:", len(data_smaller210_nodupl))

All: 12190
Length 210: 10650
No duplicates: 3761


In [46]:
# shuffle all recipes
data_shuffle = data_smaller210_nodupl.sample(frac=1).reset_index(drop=True).iloc[:,0]
data_final = data_shuffle.to_frame()
data_drop = data_final.copy()
data_final.head()

Unnamed: 0,text
0,Fleisch am besten eine Stunde vor dem Braten a...
1,Zwiebeln schälen und fein würfeln. Knoblauch s...
2,Eine runde Quicheform mit dem Blätterteig ausl...
3,Die Zwiebeln hacken und in reichlich kräftigem...
4,"Blätterteig auftauen, die einzelnen Scheiben a..."


In [68]:
# create overlap of 30
overlap = data_drop.sample(30)
overlap.to_json(path+"02-Coding/01-Data/final_labeling/overlap.json",orient='records',force_ascii=False)
overlap_batch1 = overlap.iloc[:10]
overlap_batch2 = overlap[10:]
data_drop = data_drop.drop(list(overlap.index))
print("Overlap:", len(overlap))
print("Rest:", len(data_drop))
overlap.head()

Overlap: 30
Rest: 3191


Unnamed: 0,text
2396,Backofen auf 180° vorheizen.Tomaten enthäuten ...
1672,"Radicchio waschen, rüsten, in mundgerechte Stü..."
2675,Die Tortellini nach Packungsanleitung ca. 1 Mi...
2385,"Die Zwiebel schälen, Zucchini, Möhren und Papr..."
2931,Die Nudeln in Salzwasser nach Packungsanweisun...


In [40]:
amount_recipes_batch1 = 50
amount_recipes_batch2 = 70

In [48]:
# create batch1 and batch2

batch1_jona = data_drop.sample(amount_recipes_batch1)
data_drop = data_drop.drop(list(batch1_jona.index))
batch2_jona = data_drop.sample(amount_recipes_batch2)
data_drop = data_drop.drop(list(batch2_jona.index))
print("Rest:", len(data_drop))

batch1_leo = data_drop.sample(amount_recipes_batch1)
data_drop = data_drop.drop(list(batch1_leo.index))
batch2_leo = data_drop.sample(amount_recipes_batch2)
data_drop = data_drop.drop(list(batch2_leo.index))
print("Rest:", len(data_drop))

batch1_coco = data_drop.sample(amount_recipes_batch1)
data_drop = data_drop.drop(list(batch1_coco.index))
batch2_coco = data_drop.sample(amount_recipes_batch2)
data_drop = data_drop.drop(list(batch2_coco.index))
print("Rest:", len(data_drop))

batch1_jonathan = data_drop.sample(amount_recipes_batch1)
data_drop = data_drop.drop(list(batch1_jonathan.index))
batch2_jonathan = data_drop.sample(amount_recipes_batch2)
data_drop = data_drop.drop(list(batch2_jonathan.index))
print("Rest:", len(data_drop))

Rest: 3611
Rest: 3491
Rest: 3371
Rest: 3251


In [59]:
# mix batches with overlap

batch1_jona_total = pd.concat([batch1_jona, overlap_batch1]).sample(frac = 1)
batch2_jona_total = pd.concat([batch2_jona, overlap_batch2]).sample(frac = 1)

batch1_leo_total = pd.concat([batch1_leo, overlap_batch1]).sample(frac = 1)
batch2_leo_total = pd.concat([batch2_leo, overlap_batch2]).sample(frac = 1)

batch1_coco_total = pd.concat([batch1_coco, overlap_batch1]).sample(frac = 1)
batch2_coco_total = pd.concat([batch2_coco, overlap_batch2]).sample(frac = 1)

batch1_jonathan_total = pd.concat([batch1_jonathan, overlap_batch1]).sample(frac = 1)
batch2_jonathan_total = pd.concat([batch2_jonathan, overlap_batch2]).sample(frac = 1)

In [67]:
# df to json

batch1_jona_total.to_json(path+"02-Coding/01-Data/final_labeling/batch1_jona.json",orient='records',force_ascii=False)
batch2_jona_total.to_json(path+"02-Coding/01-Data/final_labeling/batch2_jona.json",orient='records',force_ascii=False)

batch1_leo_total.to_json(path+"02-Coding/01-Data/final_labeling/batch1_leo.json",orient='records',force_ascii=False)
batch2_leo_total.to_json(path+"02-Coding/01-Data/final_labeling/batch2_leo.json",orient='records',force_ascii=False)

batch1_coco_total.to_json(path+"02-Coding/01-Data/final_labeling/batch1_coco.json",orient='records',force_ascii=False)
batch2_coco_total.to_json(path+"02-Coding/01-Data/final_labeling/batch2_coco.json",orient='records',force_ascii=False)

batch1_jonathan_total.to_json(path+"02-Coding/01-Data/final_labeling/batch1_jonathan.json",orient='records',force_ascii=False)
batch2_jonathan_total.to_json(path+"02-Coding/01-Data/final_labeling/batch2_jonathan.json",orient='records',force_ascii=False)