## Word2Vec associations

### Imports

In [1]:
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

from transformers import BertTokenizer, BertModel, pipeline

In [2]:
# Read the recipe table, then discard the 'Unnamed: 0' column
# (presumably an index saved alongside the data -- TODO confirm).
df = pd.read_csv('../data/recipes.csv')
df = df.drop(columns=['Unnamed: 0'])
print(df.shape)

(761, 17)


---
### Cleaning for Word2Vec

In [3]:
# text cleaning function credit: https://towardsdatascience.com/creating-word-embeddings-coding-the-word2vec-algorithm-in-python-using-deep-learning-b337d0ba17a8
def clean_text(
    string: str,
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=('the', 'a', 'and', 'is', 'be', 'will')) -> str:
    """
    Normalize raw text into lowercase tokens separated by single spaces.

    Steps, in order: strip URLs, strip HTML tags, delete punctuation
    characters, lowercase, drop stop words, collapse whitespace.

    Parameters
    ----------
    string : str
        The text to clean.
    punctuations : iterable of str
        Characters to delete; every occurrence of each one is removed.
    stop_words : iterable of str
        Words to drop. They are compared against the lowercased tokens,
        so they should be given in lowercase.

    Returns
    -------
    str
        The cleaned text, with no leading/trailing whitespace and no line
        breaks. An input with nothing left after cleaning yields ''.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so tokens
    separated only by punctuation are glued together
    (e.g. "350.Put" becomes "350put").
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations in a single pass over the text instead of
    # one full-string replace() per punctuation character encountered
    removable = frozenset(punctuations)
    string = ''.join(char for char in string if char not in removable)

    # Converting the text to lower
    string = string.lower()

    # Removing stop words (set lookup instead of scanning a list per word)
    ignored = frozenset(stop_words)
    kept_words = [word for word in string.split() if word not in ignored]

    # split() already discards every run of whitespace (including '\r\n')
    # and join() inserts exactly one space, so no further whitespace
    # clean-up is needed
    return ' '.join(kept_words)

In [4]:
# Show a single row to inspect the columns before the instructions are cleaned
df.head(n=1)

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,title,readyInMinutes,summary,cuisines,dishTypes,occasions,instructions,simplifiedIngredients,simplifiedInstructions
0,1,0,0,0,0,0,0,0,Gingerbread Mummies,45,Gingerbread Mummies might be just the dessert ...,[],['dessert'],['christmas'],"In a bowl of an electric mixer, beat the butte...","['unsalted butter', 'wheat flour', 'sugar', 'm...","['In a bowl of an electric mixer, beat the but..."


In [5]:
# Clean every instruction text in place. str() coerces any non-string cell
# first (a missing value would become the literal text 'nan').
df['instructions'] = list(map(clean_text, map(str, df['instructions'])))

In [6]:
# Display the cleaned instruction column (pandas truncates the long output)
df.loc[:, 'instructions']

0      in bowl of an electric mixer beat butter until...
1      preheat oven to 375 degrees f in large bowl cr...
2      preheat oven to 350 degrees f sift flour onto ...
3      preheat your oven to 400 degrees f add sliced ...
4      beat butter in large bowl in an electric mixer...
                             ...                        
756    prepare graham crust preheat oven to 350put gr...
757    preheat oven to 375 degrees in large skillet h...
758    preheat oven into 180c gently pound chicken br...
759    rinse chickpeas soak for 8 hours or overnight ...
760    oven 325f place six ramekins in water bath whi...
Name: instructions, Length: 761, dtype: object

---
## Word2Vec implementation

In [9]:
# One token list per recipe: split each cleaned instruction text on whitespace
corpus = list(map(str.split, df['instructions']))

In [10]:
# Train with a fixed seed and a single worker thread so that repeated runs
# give the same vectors; multi-threaded training adds ordering jitter.
# (Reproducibility across interpreter launches also needs PYTHONHASHSEED set.)
model = Word2Vec(corpus, seed=1, workers=1)

In [13]:
# Number of distinct words kept in the trained vocabulary
len(model.wv)

1555

In [12]:
# Look up the learned embedding for the word 'flour'
model.wv['flour']

array([ 0.02467003,  0.462431  ,  0.10952619, -0.36950684,  0.34555918,
       -0.39694858,  1.1344974 ,  0.51036364, -0.10377292, -0.24977541,
       -0.17449935, -0.7031323 ,  0.08217197, -0.56424326, -0.03330697,
       -0.16310018,  0.31470796, -0.260275  ,  0.3495805 , -0.42015603,
        0.22069149,  0.17347388,  0.25684553, -0.3124601 , -0.15932514,
        0.13784407, -0.64135724, -0.50323886, -0.12363603,  0.06647342,
        0.19679967,  0.19766821, -0.04574329, -0.13459839, -0.1052858 ,
        0.71360207,  0.12434079,  0.2325738 , -0.3467441 , -0.58889306,
        0.44267848, -0.43534586,  0.49761763, -0.18026364,  0.02424771,
       -0.08096733,  0.4785746 , -0.16669126,  0.16718604,  0.48217794,
        0.13288143, -0.5692791 ,  0.1732835 , -0.28098738, -0.03097262,
        0.25640464,  0.50001127,  0.4058189 , -0.16080895,  0.07115072,
        0.43439737, -0.33678323,  0.28753754,  0.01711828, -0.51507086,
        0.52001256, -0.06585953,  0.14276914, -0.13062638,  0.28