### Header
Input: Textual Data
Output: Same Data cleaned for synonyms
Grund: In German kitchen jargon there exist a lot of alternatives for words regarding food and tools depending on the region the recipe authour is from. Like to normalize that for the step by step guide.

In [3]:
import spacy
import pandas as pd
import numpy as np

In [4]:
# References
# https://www.youtube.com/watch?v=mCJ3wzjiNvA
# https://spacy.io/usage/vectors-similarity

In [5]:
# Similarity gets better with larger models. check
nlp = spacy.load('de_core_news_lg', disable=['parser', 'tagger', 'ner'])

In [6]:
# Pretrained word vectors by spacy
doc1 = nlp("würzen")
doc2 = nlp("abschmecken")
print(doc1.vector[:5],"...", "– Dimensions: ", len(doc1.vector))
print(doc2.vector[:5],"...", "– Dimensions: ", len(doc1.vector))
# Resulting Similarity
print("")
print("Similarity: ",doc1.similarity(doc2))

[ 0.24403 -3.4729  -2.9564   1.0566  -1.0356 ] ... – Dimensions:  300
[ 0.85765 -3.1901  -2.6978   1.3871  -2.6844 ] ... – Dimensions:  300

Similarity:  0.8424929889814099


In [7]:
doc3 = nlp("Hund Katze Maus Elefant Ratte Tiger Löwe")
sim_list = [(token1.text,token2.text,token1.similarity(token2)) for token2 in doc3 for token1 in doc3]
sim_list[:3]
for token in doc3:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)     # oov: hat ein token einen vector? has_vector: Ist ein Vector mit dem token assoziiert? --> Hä

Hund True 57.446507 False
Katze True 43.214478 False
Maus True 44.95886 False
Elefant True 24.972677 False
Ratte True 32.157112 False
Tiger True 34.42378 False
Löwe True 39.592545 False


In [8]:
df = pd.DataFrame(sim_list)
df.columns = ["Token 1", "Token 2", "Similarity"]
df

Unnamed: 0,Token 1,Token 2,Similarity
0,Hund,Hund,1.0
1,Katze,Hund,0.676083
2,Maus,Hund,0.381736
3,Elefant,Hund,0.441017
4,Ratte,Hund,0.516697
5,Tiger,Hund,0.353837
6,Löwe,Hund,0.361852
7,Hund,Katze,0.676083
8,Katze,Katze,1.0
9,Maus,Katze,0.563727


In [9]:
# Check similarities without adjusted training data
# Load the vocab of the recipes
# Ich weiss nicht ob der Kontext wichtig ist, falls ja ist das hier Bullshit
tokens_df = pd.read_pickle("/Users/Leonidas/Universität St.Gallen/STUD-Capstoneproject Tell 6 - General/Coding/cleaned_tokens")

In [10]:
tokens_df

Unnamed: 0,0
0,Eier
1,hart
2,kochen
3,pellen
4,Eierschneider
...,...
672,passt
673,frisches
674,Bauernbrot
675,kühles


In [11]:
print(len(tokens_df))
# Remove duplicates
tokens = list(set(tokens_df[0]))
print(len(tokens))

677
412


In [12]:
tokens = [ nlp(token) for token in tokens ]

In [15]:
word_vectors = [ (token, token.vector_norm) for token in tokens]
word_vectors

[(beiseite, 24.28634520186293),
 (Hirschsteaks, 0),
 (geschnittene, 26.488686540526533),
 (aufheben, 26.860388293900545),
 (Öl, 113.61053939833585),
 (verbliebene, 23.000963737933166),
 (folgende, 32.39294171182922),
 (geeignet).Den, 0),
 (Bemerkung, 26.229023394823532),
 (Pfeffer, 26.73242371411002),
 (Kochlöffel, 23.24062302476437),
 (braten, 33.83255901026115),
 (mega-ober-lecker, 0),
 (Eiern, 37.91087628456679),
 (kühles, 31.106295397734005),
 (Herd, 45.17810939521781),
 (schälen, 26.937706417849512),
 (No-Go, 28.780769523149008),
 (schlagen, 30.348483269078834),
 (Boden, 47.260135475137204),
 (Bohnenkraut, 20.976465396240023),
 (Eierschneider, 19.098527730370318),
 (einfetten, 23.606927003686426),
 (abgießen, 24.93608195328056),
 (Brühe, 32.50560622707079),
 (erhitzen, 29.41110881428712),
 (min, 65.792195305793),
 (unten, 40.79168923010776),
 (mehreren, 39.50335989503211),
 (entsprechend, 27.26076878536845),
 (Geputzten, 0),
 (jedes, 53.959113125784036),
 (Knoblauch, 22.3248924972

In [17]:
# Print the most similar for each token
trim = 20
for query in tokens[:trim]:
    print("Our word: ", query)
    closest = (None, 0)
    for token in tokens:
        if query.similarity(token) > closest[1] and query.similarity(token) != 1:
            closest = (token.text, query.similarity(token))
    print("Highest Similarity: ", closest)
    if closest[1] > 0.85:
            print("*")

Our word:  beiseite
Highest Similarity:  ('Seite', 0.4064021000077441)
Our word:  Hirschsteaks
Highest Similarity:  (None, 0)
Our word:  geschnittene
Highest Similarity:  ('geschnittenem', 0.8818902089922511)
*
Our word:  aufheben
Highest Similarity:  ('auslegen', 0.6648066081262295)
Our word:  Öl
Highest Similarity:  ('Olivenöl', 0.6375735472082416)
Our word:  verbliebene
Highest Similarity:  ('restlichen', 0.6252948351388405)
Our word:  folgende
Highest Similarity:  ('entsprechend', 0.4439625770297629)
Our word:  geeignet).Den
Highest Similarity:  (None, 0)
Our word:  Bemerkung
Highest Similarity:  ('Anmerkung', 0.7988807241006981)
Our word:  Pfeffer
Highest Similarity:  ('Knoblauch', 0.7029485770438941)
Our word:  Kochlöffel
Highest Similarity:  ('Teller', 0.6537627121909547)
Our word:  braten
Highest Similarity:  ('anbraten', 0.8924805616635546)
*
Our word:  mega-ober-lecker
Highest Similarity:  (None, 0)
Our word:  Eiern
Highest Similarity:  ('Eier', 0.8283275611232245)
Our word: 

In [22]:
# Similarity Matrix
trim = min(len(tokens), 200)
round_to = 2 # Nachkommastellen
mat = np.zeros((trim,trim))
print(mat)
for i,token1 in enumerate(tokens[:trim]):
    for k,token2 in enumerate(tokens[:trim]):
        mat[i,k] = round(token1.similarity(token2),round_to)
print(mat)
names = [ token.text for token in tokens[:trim]]
sims = pd.DataFrame(mat, columns = names, index = names)
# sims.index
sims

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ 1.    0.    0.18 ...  0.2  -0.07 -0.05]
 [ 0.    1.    0.   ...  0.    0.    0.  ]
 [ 0.18  0.    1.   ...  0.2   0.2   0.13]
 ...
 [ 0.2   0.    0.2  ...  1.    0.04  0.08]
 [-0.07  0.    0.2  ...  0.04  1.    0.33]
 [-0.05  0.    0.13 ...  0.08  0.33  1.  ]]


Unnamed: 0,beiseite,Hirschsteaks,geschnittene,aufheben,Öl,verbliebene,folgende,geeignet).Den,Bemerkung,Pfeffer,...,glatt,hart,geben,fertig,Schüssel,Würfel,hineinschlagen,Prise,mindestens,Grad
beiseite,1.00,0.0,0.18,0.25,0.05,0.15,0.08,0.0,0.19,0.16,...,0.30,0.16,0.08,0.35,0.36,0.28,0.0,0.20,-0.07,-0.05
Hirschsteaks,0.00,1.0,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00
geschnittene,0.18,0.0,1.00,0.19,0.17,0.27,0.14,0.0,0.08,0.32,...,0.32,0.16,0.19,0.25,0.29,0.29,0.0,0.20,0.20,0.13
aufheben,0.25,0.0,0.19,1.00,0.15,0.44,0.18,0.0,0.18,0.09,...,0.05,0.06,0.50,0.15,0.15,0.18,0.0,0.04,0.30,0.04
Öl,0.05,0.0,0.17,0.15,1.00,0.18,-0.03,0.0,0.02,0.30,...,0.22,0.16,0.11,0.14,0.28,0.12,0.0,0.13,0.15,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Würfel,0.28,0.0,0.29,0.18,0.12,0.13,0.01,0.0,0.10,0.33,...,0.24,0.13,0.06,0.27,0.51,1.00,0.0,0.19,0.21,0.18
hineinschlagen,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.00,0.00,0.00
Prise,0.20,0.0,0.20,0.04,0.13,0.05,-0.11,0.0,0.22,0.42,...,0.10,0.11,0.14,0.03,0.38,0.19,0.0,1.00,0.04,0.08
mindestens,-0.07,0.0,0.20,0.30,0.15,0.38,0.26,0.0,0.05,0.05,...,0.05,0.12,0.19,0.08,0.16,0.21,0.0,0.04,1.00,0.33


In [26]:
# Get the n mst similar words
word = "Prise"
count = 5
sims.nlargest(count+1, word)[word].iloc[1:] # Cut irrelvant columns and the similarity to itself (always 1.00 thus index 1)

Muskat           0.45
Zitrone          0.44
Suppe            0.43
Pfeffer          0.42
Vanillezucker    0.41
Name: Prise, dtype: float64

In [137]:
# Adjust training data / word vectors with cooking vocab