# Download necessary files
In a terminal, start a Python session and run:
1. **import nltk**
2. **nltk.download('punkt')** for the tokenizer
3. **nltk.download('averaged_perceptron_tagger')** for the POS tagger
4. **nltk.download('wordnet')** for WordNet

In [1]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import pandas as pd
import numpy as np

In [2]:
# Sample sentence used by the following cells; the bare name on the last
# line echoes the string as the cell's output.
text = 'Hello, I luckily found a wonderful notebook and I probably need interesting days to write about. Hmm, can you build a country?'
text

'Hello, I luckily found a wonderful notebook and I probably need interesting days to write about. Hmm, can you build a country?'

# 1. POS tagging
Reference: http://www.nltk.org/book/ch05.html

## 1) Split text into tokens

In [3]:
# Split the raw text into tokens; punctuation marks come out as their own tokens.
tokens = nltk.word_tokenize(text)
print(tokens)

['Hello', ',', 'I', 'luckily', 'found', 'a', 'wonderful', 'notebook', 'and', 'I', 'probably', 'need', 'interesting', 'days', 'to', 'write', 'about', '.', 'Hmm', ',', 'can', 'you', 'build', 'a', 'country', '?']


## 2) Lemmatize tokens

In [4]:
# One shared lemmatizer instance; later cells reuse `lmtzr`.
lmtzr = WordNetLemmatizer()
# Lemmatize every token with the lemmatizer's default settings (no POS is passed).
lmtzed_tokens = list(map(lmtzr.lemmatize, tokens))
print(lmtzed_tokens)

['Hello', ',', 'I', 'luckily', 'found', 'a', 'wonderful', 'notebook', 'and', 'I', 'probably', 'need', 'interesting', 'day', 'to', 'write', 'about', '.', 'Hmm', ',', 'can', 'you', 'build', 'a', 'country', '?']


## 3) POS tagging with NLTK

In [5]:
# Tag the lemmatized tokens (not the raw ones); the result is a list of
# (token, tag) pairs.
tagged_tokens = nltk.pos_tag(lmtzed_tokens)
print(tagged_tokens)

[('Hello', 'NNP'), (',', ','), ('I', 'PRP'), ('luckily', 'RB'), ('found', 'VBD'), ('a', 'DT'), ('wonderful', 'JJ'), ('notebook', 'NN'), ('and', 'CC'), ('I', 'PRP'), ('probably', 'RB'), ('need', 'VBP'), ('interesting', 'JJ'), ('day', 'NN'), ('to', 'TO'), ('write', 'VB'), ('about', 'RB'), ('.', '.'), ('Hmm', 'NNP'), (',', ','), ('can', 'MD'), ('you', 'PRP'), ('build', 'VB'), ('a', 'DT'), ('country', 'NN'), ('?', '.')]


## 4) POS tagging for WordNet

In [6]:
def get_wordnet_pos(nltk_pos):
    """Map a POS tag produced by ``nltk.pos_tag`` to a WordNet POS letter.

    Only the leading letter of the tag is inspected:
    'N...' -> 'n' (noun), 'V...' -> 'v' (verb),
    'J...' -> 'a' (adjective), 'R...' -> 'r' (adverb).
    Any other tag (including punctuation tags and the empty string)
    yields ``None``.
    """
    prefix_to_wordnet = (
        ('N', 'n'),  # Noun
        ('V', 'v'),  # Verb
        ('J', 'a'),  # Adjective
        ('R', 'r'),  # Adverb
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_pos.startswith(prefix):
            return wordnet_pos
    return None

In [7]:
# Pull the NLTK tag out of each (token, tag) pair once, then derive the
# WordNet tag from it, so both columns come from the same list.
nltk_tags = [tag for _, tag in tagged_tokens]
wordnet_tags = [get_wordnet_pos(tag) for tag in nltk_tags]

# One row per token; `columns` pins the left-to-right column order.
column_order = ['token', 'lemmatized token', 'nltk pos', 'wordnet pos']
column_values = [tokens, lmtzed_tokens, nltk_tags, wordnet_tags]
summary = pd.DataFrame(dict(zip(column_order, column_values)),
                       columns=column_order)
summary

Unnamed: 0,token,lemmatized token,nltk pos,wordnet pos
0,Hello,Hello,NNP,n
1,",",",",",",
2,I,I,PRP,
3,luckily,luckily,RB,r
4,found,found,VBD,v
5,a,a,DT,
6,wonderful,wonderful,JJ,a
7,notebook,notebook,NN,n
8,and,and,CC,
9,I,I,PRP,


# 2. Search synonyms

In [8]:
def get_synonyms(word, pos):
    """Return the other lemma names in the first WordNet synset of ``word``.

    Parameters:
        word: the (lemmatized) token to look up.
        pos:  WordNet POS letter ('n', 'v', 'a', 'r'), or ``None`` when the
              token has no WordNet part of speech.

    Returns:
        ''                      if ``pos`` is None;
        'no synset in wordnet'  if WordNet has no synset for (word, pos);
        otherwise a new list of the first synset's lemma names with exact
        (case-sensitive) matches of ``word`` left out. The list may be empty.
    """
    if pos is None:
        return ''
    synsets = wn.synsets(word, pos)
    if not synsets:
        return 'no synset in wordnet'
    # Build a fresh list rather than calling .remove() on the result of
    # lemma_names(): NLTK can hand back the synset's own internal list, and
    # synsets are cached, so removing in place would make later lookups of
    # the same synset silently lose this lemma.
    return [name for name in synsets[0].lemma_names() if name != word]

In [9]:
# Look up synonyms row by row (axis=1) from the lemmatized token and its
# WordNet POS. get_synonyms returns '' or a message string for some rows and
# a list for others, so the new column holds mixed values.
summary['synonyms'] = summary.apply(lambda row: get_synonyms(row['lemmatized token'], row['wordnet pos']), axis=1)
summary

Unnamed: 0,token,lemmatized token,nltk pos,wordnet pos,synonyms
0,Hello,Hello,NNP,n,"[hello, hullo, hi, howdy, how-do-you-do]"
1,",",",",",",,
2,I,I,PRP,,
3,luckily,luckily,RB,r,"[fortunately, fortuitously, as_luck_would_have..."
4,found,found,VBD,v,"[establish, set_up, launch]"
5,a,a,DT,,
6,wonderful,wonderful,JJ,a,"[fantastic, grand, howling, marvelous, marvell..."
7,notebook,notebook,NN,n,[]
8,and,and,CC,,
9,I,I,PRP,,


# 3. Replace randomly selected tokens with synonyms

In [10]:
def replace_with_wordnet_synonyms(text, n_tokens2replace):
    """Return the lemmatized tokens of ``text`` with some swapped for WordNet synonyms.

    The text is tokenized with ``nltk.word_tokenize``, each token is
    lemmatized with the module-level ``lmtzr`` and POS-tagged. Randomly chosen
    positions are then replaced by a lemma name taken from the first WordNet
    synset of that token and POS. A position whose token has no WordNet POS,
    or no synonym, is left unchanged.

    Parameters:
        text:             the input string.
        n_tokens2replace: how many distinct token positions to try to replace.
                          If it is at least the number of tokens, every
                          position is instead tried with probability 1/2.

    Returns:
        A list of string tokens. Any token containing '_' (e.g. a multi-word
        synonym such as 'set_up') is split on '_' into separately lemmatized
        tokens, so the result can be longer than the input token list.
    """

    def wordnet_pos_tag(tokens):
        # WordNet POS letter (or None) for every token, in order.
        return [get_wordnet_pos(nltk_pos) for _, nltk_pos in nltk.pos_tag(tokens)]

    def find_synonyms(word, pos):
        synsets = wn.synsets(word, pos)
        if not synsets:
            return []
        # Build a new list instead of .remove() on lemma_names(): NLTK can
        # return the cached synset's internal list, and mutating it would
        # corrupt every later lookup of that synset.
        return [name for name in synsets[0].lemma_names() if name != word]

    def pick_a_random_synonym_index(n_synonyms):
        # Linearly decreasing weights n, n-1, ..., 1: lemma names listed
        # earlier in the synset are picked more often.
        weights = np.arange(n_synonyms, 0, -1, dtype=float)
        # No size argument, so choice() returns a scalar; converting a
        # 1-element array with int() is deprecated in recent NumPy.
        return int(np.random.choice(n_synonyms, p=weights / weights.sum()))

    def swap_in_synonym(tokens, pos_list, i):
        # Replaces tokens[i] in place when a synonym is available.
        pos = pos_list[i]
        if pos is None:
            return tokens
        synonyms = find_synonyms(tokens[i], pos)
        if synonyms:
            tokens[i] = synonyms[pick_a_random_synonym_index(len(synonyms))]
        return tokens

    tokens = [lmtzr.lemmatize(token) for token in nltk.word_tokenize(text)]
    pos_list = wordnet_pos_tag(tokens)
    n = len(tokens)

    if n_tokens2replace >= n:
        # More replacements requested than tokens exist: flip a fair coin
        # for every position instead.
        for j in range(n):
            if np.random.randint(0, 2) == 1:
                swap_in_synonym(tokens, pos_list, j)
    else:
        # Sample positions WITHOUT replacement so exactly n_tokens2replace
        # distinct positions are tried; drawing with replacement and then
        # de-duplicating would usually try fewer than requested.
        for j in np.random.choice(n, n_tokens2replace, replace=False):
            swap_in_synonym(tokens, pos_list, j)

    # WordNet joins multi-word lemmas with '_'; emit them as separate tokens.
    new_tokens = []
    for token in tokens:
        if '_' in token:
            new_tokens.extend(lmtzr.lemmatize(part) for part in token.split('_'))
        else:
            new_tokens.append(token)

    return new_tokens

## Example: generating 5 augmented versions of one text

In [11]:
# Print the untouched lemmatized tokens first as the baseline for comparison.
print('*** original lemmatized tokens ***\n')
print(lmtzed_tokens)
print('=' * 100)

# Each call makes its own random choices, so the printed versions may differ.
n_versions = 5
for version in range(1, n_versions + 1):
    print('*** output #{} ***\n'.format(version))
    augmented_tokens = replace_with_wordnet_synonyms(text, 15)
    print(augmented_tokens)
    print('-' * 100)

*** original lemmatized tokens ***

['Hello', ',', 'I', 'luckily', 'found', 'a', 'wonderful', 'notebook', 'and', 'I', 'probably', 'need', 'interesting', 'day', 'to', 'write', 'about', '.', 'Hmm', ',', 'can', 'you', 'build', 'a', 'country', '?']
*** output #1 ***

['Hello', ',', 'I', 'fortunately', 'establish', 'a', 'grand', 'notebook', 'and', 'I', 'in', 'all', 'probability', 'involve', 'interesting', 'day', 'to', 'write', 'close', 'to', '.', 'Hmm', ',', 'can', 'you', 'construct', 'a', 'country', '?']
----------------------------------------------------------------------------------------------------
*** output #2 ***

['Hello', ',', 'I', 'luckily', 'set', 'up', 'a', 'terrific', 'notebook', 'and', 'I', 'likely', 'need', 'interesting', 'day', 'to', 'write', 'just', 'about', '.', 'Hmm', ',', 'can', 'you', 'construct', 'a', 'country', '?']
----------------------------------------------------------------------------------------------------
*** output #3 ***

['how-do-you-do', ',', 'I', 'luc