In [1]:
import pandas as pd
import random
import ast

## reading random sample of csv

In [4]:
random.seed(4321)
df=pd.read_csv('/kaggle/input/recipenlg/RecipeNLG_dataset.csv',
               converters={'NER': ast.literal_eval,'ingredients':ast.literal_eval,'directions':ast.literal_eval}, skiprows=lambda x: x > 0 and random.random() >=0.05)

In [5]:
print(len(df))

111636


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,6,Rhubarb Coffee Cake,"[1 1/2 c. sugar, 1/2 c. butter, 1 egg, 1 c. bu...","[Cream sugar and butter., Add egg and beat wel...",www.cookbooks.com/Recipe-Details.aspx?id=210288,Gathered,"[sugar, butter, egg, buttermilk, flour, salt, ..."
1,50,Chicken Ole,"[4 chicken breasts, cooked, 1 can cream of chi...","[Dice chicken., Mix all ingredients together.,...",www.cookbooks.com/Recipe-Details.aspx?id=445786,Gathered,"[chicken breasts, cream of chicken soup, cream..."
2,87,Fast Real Good Fudge,"[4 lb. confectioners sugar, 1 c. cocoa, 1 c. c...","[In large bowl, put confectioners sugar, cocoa...",www.cookbooks.com/Recipe-Details.aspx?id=505741,Gathered,"[confectioners sugar, cocoa, nuts, margarine, ..."
3,93,Chicken Roll-Ups,"[1 (8 oz.) can crescent rolls, 1 1/2 c. grated...",[Unroll crescent rolls; add cooked chicken and...,www.cookbooks.com/Recipe-Details.aspx?id=449613,Gathered,"[crescent rolls, cheese, hen, cream of chicken..."
4,99,Brickle Bars,"[1 box yellow cake mix, 2 eggs, 1/3 c. soft ma...","[Mix together with a fork:, cake mix, 1 egg an...",www.cookbooks.com/Recipe-Details.aspx?id=351194,Gathered,"[yellow cake mix, eggs, margarine, milk, pecans]"


In [7]:
print(type(df['ingredients'][0]))

<class 'list'>


In [8]:
count=df.explode('NER')

In [9]:
distinct=len(pd.unique(count['NER']))

In [10]:
distinct

34750

## From seperated ingredients to sentence ingredients to feed to Word2vec

In [11]:
df['sentence_ingredients']=df['NER'].apply(lambda x:' '.join(x))

In [13]:
df['sentence_ingredients']

0         sugar butter egg buttermilk flour salt soda bu...
1         chicken breasts cream of chicken soup cream of...
2         confectioners sugar cocoa nuts margarine Velve...
3         crescent rolls cheese hen cream of chicken sou...
4                yellow cake mix eggs margarine milk pecans
                                ...                        
111631    tomatoes zucchini Stove Top Stuffing Mix chedd...
111632    chicken broth water orzo pasta fresh asparagus...
111633    lemon juice Italian salad dressing salad oil c...
111634                               Chicken koji Olive oil
111635                                Eggs Water Water Salt
Name: sentence_ingredients, Length: 111636, dtype: object

In [13]:
https://medium.com/red-buffer/doc2vec-computing-similarity-between-the-documents-47daf6c828cd

In [14]:
pip install gensim==4.0.1

[0mNote: you may need to restart the kernel to use updated packages.


## Building Doc2vec model

In [2]:
import gensim
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec



In [16]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(df['sentence_ingredients'])]

In [None]:
print (tagged_data)

In [19]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=80)

In [20]:
model.build_vocab(tagged_data)

In [21]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=80)

In [22]:
model.save("d2v.model")

## Saving CSV for later inferences

In [4]:
df=pd.read_csv('data.csv')

## Loading Trained model

In [5]:
model = Doc2Vec.load("d2v.model")

## Trying dataset sample inference

In [24]:
similar_doc = model.docvecs.most_similar('0')
print(similar_doc[0])

('19184', 0.8977519869804382)


  """Entry point for launching an IPython kernel.


In [29]:
tagged_data[0]


TaggedDocument(words=['sugar', 'butter', 'egg', 'buttermilk', 'flour', 'salt', 'soda', 'buttermilk', 'rhubarb', 'vanilla'], tags=['0'])

In [7]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,sentence_ingredients
0,6,Rhubarb Coffee Cake,"['1 1/2 c. sugar', '1/2 c. butter', '1 egg', '...","['Cream sugar and butter.', 'Add egg and beat ...",www.cookbooks.com/Recipe-Details.aspx?id=210288,Gathered,"['sugar', 'butter', 'egg', 'buttermilk', 'flou...",sugar butter egg buttermilk flour salt soda bu...


In [28]:
tagged_data[19184]

TaggedDocument(words=['brown', 'sugar', 'shortening', 'egg', 'salt', 'buttermilk', 'vanilla', 'flour', 'rhubarb'], tags=['19184'])

In [36]:
df.iloc[[19184]]

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,sentence_ingredients
19184,385296,Rhubarb Cake,"[1 1/2 c. brown sugar, 1/2 c. shortening, 1 eg...","[Mix ingredients (except flour and rhubarb), t...",www.cookbooks.com/Recipe-Details.aspx?id=1064570,Gathered,"[brown sugar, shortening, egg, salt, buttermil...",brown sugar shortening egg salt buttermilk van...


## User sentence inference

In [6]:
inference="Hi i have garlic, rice, tuna, and parsley"
inference=gensim.utils.simple_preprocess(inference, deacc=True)
#test_data = word_tokenize(inference.lower())

is_noun = lambda pos: pos[:2] == 'NN'

nouns = [word for (word, pos) in nltk.pos_tag(inference) if is_noun(pos)] 
nouns = set(nouns)
print(list(nouns))


v1 = model.infer_vector(list(nouns))
print("V1_infer", v1)

['parsley', 'tuna', 'hi', 'rice']
V1_infer [-0.9552876  -0.7347755  -0.51856655 -1.0541384   0.5069083   0.54076976
 -1.0236523  -0.02430548 -0.07206278 -1.2545625  -0.43070257  0.10354666
  1.2647878   0.56946796  0.6260673   0.298627    1.0838041  -0.36423108
 -0.54128575  0.09966007 -0.34308055  0.4881523   0.38429958 -1.3928645
  0.5233941  -0.63566804 -0.1613008   0.02321183  0.5563536  -0.20038882]


In [7]:
similar_docs = model.docvecs.most_similar(positive=[v1])

  similar_docs = model.docvecs.most_similar(positive=[v1])


In [8]:
print(similar_docs)

[('45412', 0.7869112491607666), ('28322', 0.7734383344650269), ('24806', 0.7513725757598877), ('39178', 0.7499082684516907), ('97677', 0.7456322312355042), ('66570', 0.7446256875991821), ('105632', 0.7404688000679016), ('44063', 0.7341943383216858), ('84023', 0.7324774265289307), ('311', 0.7274543046951294)]


In [9]:
indexes=[i[0] for i in similar_docs]


In [10]:
print(indexes)

['45412', '28322', '24806', '39178', '97677', '66570', '105632', '44063', '84023', '311']


In [11]:
pd.set_option("display.max_colwidth", 100)

In [12]:
for i in indexes:
    print(df.iloc[[int(i)]][['link','NER']])

                                                      link  \
45412  www.allrecipes.com/recipe/17696/shrimp-creole-bake/   

                                                                                                       NER  
45412  ['water', 'white rice', 'onions', 'mushrooms', 'carrots', 'green bell pepper', 'celery', 'zucchi...  
                                                  link  \
28322  www.cookbooks.com/Recipe-Details.aspx?id=966100   

                                                                                                       NER  
28322  ['oleo', 'onions', 'frozen peas', 'milk', 'cream of chicken soup', 'pepper', 'tuna', 'parsley', ...  
                                                  link  \
24806  www.cookbooks.com/Recipe-Details.aspx?id=161542   

                                                                           NER  
24806  ['cheese soup', 'milk', 'rice', 'tuna', 'parsley', 'corn flake crumbs']  
                                           