In [1]:
#importing nltk
import nltk 
import pandas as pd

In [2]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guilh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guilh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\guilh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk import pos_tag
from nltk.tokenize.treebank import TreebankWordDetokenizer
lemmatizer = WordNetLemmatizer()

In [4]:
sentence = '''
baby
'''

In [5]:
tokenized = word_tokenize(sentence)
tagged  = pos_tag(tokenized)
tagged


[('baby', 'NN')]

In [6]:
def synonym_tagger (words):
    tagged = pos_tag(words)
    results = []
    for word, tag in tagged:
        if tag.startswith('N'): results.append((word, 'n'))
        if tag.startswith('V'): results.append((word, 'v'))
        if tag.startswith('R'): results.append((word, 'r'))
        # this 'a' tag is not recognized in wornet synset and I don't know why
        # if tag.startswith('JJ'): results.append((word, 'a'))
    return results

def wordnet_tag(tag):
    if tag == 'n': return wordnet.NOUN
    if tag == 'v': return wordnet.VERB
    if tag == 'r': return wordnet.ADV
    # this 'a' tag is not recognized in wornet synset and I don't know why
    # if tag == 'a': return wordnet.ADJ


In [7]:
tagged_proper = synonym_tagger(tokenized)
tagged_proper

[('baby', 'n')]

In [8]:
def find_synonyms(word_tagged, dict, treshold):
    word, tag = word_tagged
    num_syns = 0
    lemma = lemmatizer.lemmatize(word)
    try:
        word_syn = wordnet.synset(lemma + '.' + tag + '.01')

        for syn in wordnet.synsets(word, wordnet_tag(tag)):

            similarity = word_syn.wup_similarity(syn)
            if similarity < treshold: break

            for lemma in syn.lemmas():
                if lemma.name() not in [x[0] for x in dict[word]]: 
                    dict[word].append((lemma.name(), similarity))
    except:
        pass


In [9]:
syn_dict = {}
for word, tag in tagged_proper: syn_dict[word] = []

for word_tag in tagged_proper:
    find_synonyms(word_tag, syn_dict, 0.5)

In [10]:
syn_dict

{'baby': [('baby', 1.0),
  ('babe', 1.0),
  ('infant', 1.0),
  ('child', 0.631578947368421),
  ('sister', 0.5454545454545454)]}

actually generating sentences based on the synonyms created


In [11]:
df = pd.DataFrame(columns=['word', 'syn', 'tag', 'sim'])

for word in syn_dict.keys():
    for syn in syn_dict[word]:
        row = {
            'word': word, 
            'syn': syn[0],
            'tag': dict(tagged_proper)[word],
            'sim': syn[1]
        }
        df = df.append(row, ignore_index=True)


df.head()

Unnamed: 0,word,syn,tag,sim
0,baby,baby,n,1.0
1,baby,babe,n,1.0
2,baby,infant,n,1.0
3,baby,child,n,0.631579
4,baby,sister,n,0.545455


In [12]:
# df = pd.concat([df, pd.get_dummies(df['tag'], columns=['n','v','r','a'])], axis='columns').reset_index(drop=True).drop(columns=['tag'])
df2 = pd.get_dummies(df, columns=['tag'], prefix=None)
tags = ['tag_n', 'tag_v', 'tag_r', 'tag_a']
for tag in tags:
    if tag not in df2: df2[tag] = 0

df2

Unnamed: 0,word,syn,sim,tag_n,tag_v,tag_r,tag_a
0,baby,baby,1.0,1,0,0,0
1,baby,babe,1.0,1,0,0,0
2,baby,infant,1.0,1,0,0,0
3,baby,child,0.631579,1,0,0,0
4,baby,sister,0.545455,1,0,0,0


In [13]:
df = pd.get_dummies(df, columns=['tag'])

# Check if there are missing columns and create them
tags = ['tag_n', 'tag_v', 'tag_r', 'tag_a']
for tag in tags: 
    if tag not in df: df[tag] = 0

substitution_order_preference = ['tag_r', 'tag_n', 'tag_v']
df.sort_values(substitution_order_preference + ['sim'], ascending=False,  ignore_index=True, inplace=True)

df.head()

Unnamed: 0,word,syn,sim,tag_n,tag_v,tag_r,tag_a
0,baby,baby,1.0,1,0,0,0
1,baby,babe,1.0,1,0,0,0
2,baby,infant,1.0,1,0,0,0
3,baby,child,0.631579,1,0,0,0
4,baby,sister,0.545455,1,0,0,0


In [14]:
# trying to delete repeating rows
df = df[df['word'] != df['syn']].reset_index()

In [15]:
df.head()

Unnamed: 0,index,word,syn,sim,tag_n,tag_v,tag_r,tag_a
0,1,baby,babe,1.0,1,0,0,0
1,2,baby,infant,1.0,1,0,0,0
2,3,baby,child,0.631579,1,0,0,0
3,4,baby,sister,0.545455,1,0,0,0


In [16]:
def sentence_generator(sentence, synonym_dict, num_sentences):
    sentences = []
    sentence_counter = 0

    tokenized = word_tokenize(sentence)

    for index in range(df.shape[0]):
        word = df.at[index, 'word']
        syn = df.at[index, 'syn']
        word_index = tokenized.index(word)
        sent = [t for t in tokenized]
        sent[word_index] = syn.replace('_', ' ')
        untokenized_sentence = TreebankWordDetokenizer().detokenize(sent)
        sentences.append(untokenized_sentence)
        sentence_counter += 1

        if sentence_counter >= num_sentences: break

    return sentences


In [17]:
sentence_generator(sentence, syn_dict, 10)

['babe', 'infant', 'child', 'sister']

# Testing on real methods


In [18]:
from similar_sentence_generator import generate_sentences

In [19]:
import pickle

In [20]:
df = pd.read_pickle('hand_classified_methods/classified_methods')
df.head()

Unnamed: 0,description,Measuring,Plating,Smoking,Toasting,Microwaving,Air Frying,Double Boiler,Bain Marie,Reducing,...,Seasoning,Salting,Slicing,Chopping Fruits,Chopping Mushroom,Chopping Herbs,Mincing,Batonnet,Dicing,Roughly Chopping
0,"Put the mushrooms, chard, oil, garlic, chilli,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Bring a large pan of salted water to the boil,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Add the drained pasta and chopped tomatoes to ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Toast the cumin seeds, fennel seeds and black ...",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Transfer to a mortar and pestle and grind to a...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
row=6
method = df.at[row,'description']
print(method)
generate_sentences(method, 5,similarity_treshold=0.9)

Add more lemon juice, garlic, cumin or salt to taste. Turn out into a dinner plate, and make smooth with the back of a spoon. Drizzle with extra virgin olive oil and scatter with the reserved chickpeas. 


['Add to a greater extent lemon juice, garlic, cumin or salt to taste . Turn out into a dinner plate, and make smooth with the back of a spoon . Drizzle with extra virgin olive oil and scatter with the reserved chickpeas.',
 'attention deficit disorder more lemon juice, garlic, cumin or salt to taste . Turn out into a dinner plate, and make smooth with the back of a spoon . Drizzle with extra virgin olive oil and scatter with the reserved chickpeas.',
 'ADD more lemon juice, garlic, cumin or salt to taste . Turn out into a dinner plate, and make smooth with the back of a spoon . Drizzle with extra virgin olive oil and scatter with the reserved chickpeas.',
 'attention deficit hyperactivity disorder more lemon juice, garlic, cumin or salt to taste . Turn out into a dinner plate, and make smooth with the back of a spoon . Drizzle with extra virgin olive oil and scatter with the reserved chickpeas.',
 'ADHD more lemon juice, garlic, cumin or salt to taste . Turn out into a dinner plate, a

['Add the drained pasta and chopped tomatoes to the roasting tin and mix good . Scatter over the remaining sage, feta and breadcrumbs and bake for a further 30 minutes, or until golden and bubbling . Serve hot.',
 'Add the drained pasta and chopped tomatoes to the roasting tin and mix easily . Scatter over the remaining sage, feta and breadcrumbs and bake for a further 30 minutes, or until golden and bubbling . Serve hot.',
 'Add the drained pasta and chopped tomatoes to the roasting tin and mix considerably . Scatter over the remaining sage, feta and breadcrumbs and bake for a further 30 minutes, or until golden and bubbling . Serve hot.',
 'Add the drained pasta and chopped tomatoes to the roasting tin and mix substantially . Scatter over the remaining sage, feta and breadcrumbs and bake for a further 30 minutes, or until golden and bubbling . Serve hot.',
 'Add the drained pasta and chopped tomatoes to the roasting tin and mix intimately . Scatter over the remaining sage, feta and b