In [1]:
"""
Input: Textual Data
Output: Same Data cleaned for synonyms
Grund: In German kitchen jargon there exist a lot of alternatives for words regarding food and tools depending on the region the recipe authour is from. Like to normalize that for the step by step guide.
"""

'\nInput: Textual Data\nOutput: Same Data cleaned for synonyms\nGrund: In German kitchen jargon there exist a lot of alternatives for words regarding food and tools depending on the region the recipe authour is from. Like to normalize that for the step by step guide.\n'

In [105]:
import spacy
import pandas as pd
import numpy as np

In [3]:
# References
# https://www.youtube.com/watch?v=mCJ3wzjiNvA
# https://spacy.io/usage/vectors-similarity

In [4]:
# Similarity gets better with larger models. check
nlp = spacy.load('de_core_news_lg', disable=['parser', 'tagger', 'ner'])

In [40]:
# Pretrained word vectors by spacy
doc1 = nlp("würzen")
doc2 = nlp("abschmecken")
print(doc1.vector[:5],"...", "– Dimensions: ", len(doc1.vector))
print(doc2.vector[:5],"...", "– Dimensions: ", len(doc1.vector))
# Resulting Similarity
print("")
print("Similarity: ",doc1.similarity(doc2))

[ 0.24403 -3.4729  -2.9564   1.0566  -1.0356 ] ... – Dimensions:  300
[ 0.85765 -3.1901  -2.6978   1.3871  -2.6844 ] ... – Dimensions:  300

Similarity:  0.8424929889814099


In [6]:
doc3 = nlp("Hund Katze Maus Elefant Ratte Tiger Löwe")
sim_list = [(token1.text,token2.text,token1.similarity(token2)) for token2 in doc3 for token1 in doc3]
sim_list[:3]
for token in doc3:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)     # oov: hat ein token einen vector? has_vector: Ist ein Vector mit dem token assoziiert? --> Hä

Hund True 57.446507 False
Katze True 43.214478 False
Maus True 44.95886 False
Elefant True 24.972677 False
Ratte True 32.157112 False
Tiger True 34.42378 False
Löwe True 39.592545 False


In [7]:
df = pd.DataFrame(sim_list)
df.columns = ["Token 1", "Token 2", "Similarity"]
df.head()

Unnamed: 0,Token 1,Token 2,Similarity
0,Hund,Hund,1.0
1,Katze,Hund,0.676083
2,Maus,Hund,0.381736
3,Elefant,Hund,0.441017
4,Ratte,Hund,0.516697


In [19]:
# Check similarities without adjusted training data
# Load the vocab of the recipes
# Ich weiss nicht ob der Kontext wichtig ist, falls ja ist das hier Bullshit
tokens_df = pd.read_pickle("/Users/Leonidas/Universität St.Gallen/STUD-Capstoneproject Tell 6 - General/Coding/cleaned_tokens")

In [62]:
print(len(tokens_df))
# Remove duplicates
tokens = list(set(tokens_df[0]))
print(len(tokens))

677
412


In [63]:
tokens = [ nlp(token) for token in tokens ]

In [64]:
word_vectors = [ (token, token.vector_norm) for token in tokens]

In [104]:
# Print the most similar for each token
trim = 20
for query in tokens[:trim]:
    print("Our word: ", query)
    closest = (None, 0)
    for token in tokens:
        if query.similarity(token) > closest[1] and query.similarity(token) != 1:
            closest = (token.text, query.similarity(token))
    print("Highest Similarity: ", closest)
    if closest[1] > 0.85:
            print("*")

Our word:  Grad
Highest Similarity:  ('Minuten', 0.4862608255737344)
Our word:  Spargelstücke
Highest Similarity:  ('Spargelspitzen', 0.7774956571528802)
Our word:  min
Highest Similarity:  ('Min', 0.8343673976416094)
Our word:  bringen
Highest Similarity:  ('ziehen', 0.6820054354857157)
Our word:  vermengen
Highest Similarity:  ('vermischen', 0.8338406724168218)
Our word:  Sauce
Highest Similarity:  ('Marinade', 0.7751357985489428)
Our word:  Vanillemark
Highest Similarity:  ('Vanillezucker', 0.8122022587341052)
Our word:  gegriffen
Highest Similarity:  ('reichen', 0.520744661664803)
Our word:  abtropfen
Highest Similarity:  ('abgießen', 0.772762118240911)
Our word:  Sauerkraut
Highest Similarity:  ('Kartoffeln', 0.7530034767192818)
Our word:  weglassen
Highest Similarity:  ('abziehen', 0.6650948011533647)
Our word:  köcheln
Highest Similarity:  ('Köcheln', 0.7658935677630981)
Our word:  Suppe
Highest Similarity:  ('Brühe', 0.7615803094716043)
Our word:  Sauerkirschen
Highest Similari

In [143]:
# Similarity Matrix
trim = min(len(tokens), 200)
round_to = 2 # Nachkommastellen
mat = np.zeros((trim,trim))
print(mat)
for i,token1 in enumerate(tokens[:trim]):
    for k,token2 in enumerate(tokens[:trim]):
        mat[i,k] = round(token1.similarity(token2),round_to)
print(mat)
names = [ token.text for token in tokens[:trim]]
sims = pd.DataFrame(mat, columns = names, index = names)
# sims.index
sims

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ 1.    0.15  0.4  ... -0.08  0.1   0.04]
 [ 0.15  1.    0.07 ...  0.13  0.49  0.06]
 [ 0.4   0.07  1.   ... -0.17  0.07  0.1 ]
 ...
 [-0.08  0.13 -0.17 ...  1.   -0.08  0.05]
 [ 0.1   0.49  0.07 ... -0.08  1.    0.15]
 [ 0.04  0.06  0.1  ...  0.05  0.15  1.  ]]


Unnamed: 0,Grad,Spargelstücke,min,bringen,vermengen,Sauce,Vanillemark,gegriffen,abtropfen,Sauerkraut,...,Zucker,feine,steif,Ca,allerbesten,Baguette,Schiene,mehreren,Sauerrahm,hart
Grad,1.00,0.15,0.40,0.03,0.20,0.10,0.06,0.02,0.20,0.12,...,0.17,0.01,0.13,0.07,0.00,0.07,0.15,-0.08,0.10,0.04
Spargelstücke,0.15,1.00,0.07,0.17,0.48,0.58,0.48,0.18,0.49,0.57,...,0.47,0.37,0.13,0.04,0.28,0.51,0.26,0.13,0.49,0.06
min,0.40,0.07,1.00,-0.09,0.04,0.05,0.09,-0.13,0.15,0.06,...,0.08,-0.03,0.14,0.23,-0.15,0.07,0.10,-0.17,0.07,0.10
bringen,0.03,0.17,-0.09,1.00,0.48,0.03,0.03,0.43,0.20,0.13,...,0.09,0.21,-0.01,-0.02,0.34,0.02,0.11,0.24,-0.03,0.08
vermengen,0.20,0.48,0.04,0.48,1.00,0.34,0.38,0.44,0.53,0.35,...,0.45,0.25,0.14,0.02,0.35,0.16,0.23,0.40,0.24,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Baguette,0.07,0.51,0.07,0.02,0.16,0.54,0.42,0.07,0.35,0.54,...,0.33,0.32,0.12,0.04,0.18,1.00,0.17,-0.02,0.54,0.10
Schiene,0.15,0.26,0.10,0.11,0.23,0.15,0.06,0.20,0.20,0.15,...,0.16,0.12,0.23,-0.01,0.01,0.17,1.00,0.09,0.17,0.17
mehreren,-0.08,0.13,-0.17,0.24,0.40,-0.00,0.02,0.21,0.09,0.09,...,0.08,0.13,0.01,0.04,0.41,-0.02,0.09,1.00,-0.08,0.05
Sauerrahm,0.10,0.49,0.07,-0.03,0.24,0.62,0.57,0.01,0.45,0.67,...,0.49,0.19,0.26,0.01,0.08,0.54,0.17,-0.08,1.00,0.15


In [147]:
# Get the n mst similar words
word = "vermengen"
count = 5
sims.nlargest(count+1, word)[word].iloc[1:] # Cut irrelvant columns and the similarity to itself (always 1.00 thus index 1)

vermischen    0.83
verrühren     0.82
pürieren      0.78
unterheben    0.76
marinieren    0.72
Name: vermengen, dtype: float64

In [137]:
# Adjust training data / word vectors with cooking vocab