## Similar sentences generator.

- This notebook tries to generate similar sentences to a given input sentence. 
- It makes use of WordNet and GloVe embeddings to arrive at substitute words for candidate words in a sentence.
- Similarity threshold, and number of sentences generated can be controlled.

#### Load Glove Vectors 

In [1]:
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np
from numpy.linalg import norm

In [2]:
import os
import shutil
import smart_open
from sys import platform

import gensim


def prepend_line(infile, outfile, line):
    """ 
    Function use to prepend lines using bash utilities in Linux. 
    (source: http://stackoverflow.com/a/10850588/610569)
    """
    with open(infile, 'r') as old:
        with open(outfile, 'w') as new:
            new.write(str(line) + "\n")
            shutil.copyfileobj(old, new)

def prepend_slow(infile, outfile, line):
    """
    Slower way to prepend the line by re-creating the inputfile.
    """
    with open(infile, 'r') as fin:
        with open(outfile, 'w') as fout:
            fout.write(line + "\n")
            for line in fin:
                fout.write(line)


def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format."""
    with smart_open.smart_open(glove_file_name, 'r') as f:
        num_lines = sum(1 for line in f)
    with smart_open.smart_open(glove_file_name, 'r') as f:
        num_dims = len(f.readline().split()) - 1
    return num_lines, num_dims

# Input: GloVe Model File
# More models can be downloaded from http://nlp.stanford.edu/projects/glove/
glove_file='./glove.6B/glove.6B.300d.txt'

num_lines, dims = get_lines(glove_file)

# Output: Gensim Model text format.
gensim_file='./glove.6B/glove.6B.300d.new.txt'
gensim_first_line = "{} {}".format(num_lines, dims)

# Prepends the line.
if platform == "linux" or platform == "linux2":
    prepend_line(glove_file, gensim_file, gensim_first_line)
else:
    prepend_slow(glove_file, gensim_file, gensim_first_line)

In [3]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format(gensim_file,binary=False)

## NLTK + WordNet

In [4]:
# nltk is used to perform POS tagging
import nltk 
from nltk.corpus import wordnet
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [5]:
def fetch_pos_identity(pos_tag):
    '''
    This method returns
    1. 'np' for proper nouns, 'n' for all other nouns
    2. 'a' for adjectives    
    3. 'v' for verbs    
    4. 'r' for adverbs    
    5. None for all other tags
    '''
    if pos_tag in ['NNP', 'NNPS']:
        return 'np'
    elif pos_tag in ['NN', 'NNS']:
        return 'n'
    elif pos_tag in ['JJ', 'JJR', 'JJS']:
        return 'a'
    elif pos_tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return 'v'
    elif pos_tag in ['RB', 'RBR', 'RBS']:
        return 'r'
    else:
        return None

In [6]:
from nltk.corpus import wordnet
for result in wordnet.synsets('friend'):
    print(result.name())
    print(result.definition())
    print(result.examples())

friend.n.01
a person you know well and regard with affection and trust
['he was my best friend at the university']
ally.n.02
an associate who provides cooperation or assistance
["he's a good ally in fight"]
acquaintance.n.03
a person with whom you are acquainted
['I have trouble remembering the names of all my acquaintances', 'we are friends of the family']
supporter.n.01
a person who backs a politician or a team etc.
['all their supporters came out for the game', 'they are friends of the library']
friend.n.05
a member of the Religious Society of Friends founded by George Fox (the Friends have never called themselves Quakers)
[]


In [7]:
def get_related_words(word, pos_tag, similarity_threshold):
    
    '''
    This method returns most similar words to the word passed.
    
    args:    
    word = input word
    pos_tag = Simple POS tag of the word
    similarity_threshold (float) = Value between 0 and 1. Indicates the similarity threshold to consider
    
    returns:    
    a list of similar words, along with the original word
    '''
    word = lemmatizer.lemmatize(word, pos_tag)
    synonyms = [word] 
    
    try:
        vector_check = glove_model.get_vector(word)
    except:
        return synonyms

    for syn in wordnet.synsets(word): 
        for l in syn.lemmas():
            if l.name() in synonyms:
                continue
            try:
                if l.name() in synonyms:
                    continue
                vector_prospect = glove_model.get_vector(l.name())
                A = np.array(vector_check)
                B = np.array(vector_prospect)
                cosine_diff = np.dot(A,B)/(norm(A)*norm(B))
                if cosine_diff > similarity_threshold:
                    synonyms.append(l.name())
            except:                
                pass
    return synonyms

In [8]:
def get_next_position(total_synonym_array, position_array, last_position):
    '''    
    This method returns the next position of word replacement.
    
    args:
    total_synonym_array = Array containing the total length of synonyms
    position_array = Array containing current positions
    last_position_array = Integer
    
    returns:
    next position to be updated, -1 if all positions are exhausted
    '''
    new_pos = last_position
    for i in range(len(total_synonym_array)):
        new_pos = (new_pos + 1) % len(total_synonym_array)
        if position_array[new_pos] == -1 or position_array[new_pos] == total_synonym_array[new_pos]:
            continue
        else:
            return new_pos
    return -1

In [9]:
def get_position_arrays(sentence_combination):
    '''
    This is a utility method to get position arrays.
    
    args:    
    sentence_combination = [[word], [word1, word2, ]]
    
    returns:    
    two position arrays
    '''
    total_synonym_array = []
    initial_position_array = []    
    for each_word_array in sentence_combination:
        length = len(each_word_array)
        total_synonym_array.append(length)
        if length == 1:
            initial_position_array.append(-1)
        else:
            initial_position_array.append(0)
    
    return total_synonym_array, initial_position_array

In [10]:
def provide_alternate_sentence(sentence, num_versions=1, max_changes=1, similarity_threshold=0.7, ignore_stopwords=True, ignore_proper_nouns=True):
    '''
    This method returns an alternate version(s) of the sentence passed by replacing words with their closest synonyms.
    
    args:
    sentence (String) = the input sentence
    num_versions (int) = the number of alternate versions required
    max_changes (int) = the maximum number of changes between versions
    similarity_threshold (float) = Value between 0 and 1. Indicates the similarity threshold to consider while replacing words
    ignore_stopwords (bool) = If True, stopwords will not be considered for replacement
    ignore_proper_nouns (bool) = If True, proper nouns will be ignored for replacement
    
    returns:
    list of alternate sentence(s)
    '''
    alternate_sentences = []    
    sentence_combination = []
    words = sentence.split()
    pos_tags = nltk.pos_tag(words)
    
    # Getting Postion tags and possible combinations with synonyms
    for each_word_pos in pos_tags:        
        word = each_word_pos[0]
        pos_tag = each_word_pos[1]
        short_pos = fetch_pos_identity(pos_tag) # if POS is noun, adj, adv, or verb - return pos tag.
        
        if ignore_proper_nouns and 'np' == short_pos:
            sentence_combination.append([word])
            continue        
        if short_pos is not None:
            word_lemmatized = lemmatizer.lemmatize(word, short_pos)
        else:
            word_lemmatized = lemmatizer.lemmatize(word)
        if ignore_stopwords and (word_lemmatized in stop_words or word in stop_words):
            sentence_combination.append([word])
            continue        
        if short_pos is not None:
            sentence_combination.append(get_related_words(word, short_pos, similarity_threshold))
        else:
            sentence_combination.append([word])
            continue
    
    # Getting number of possible combinations and indexes
    total_synonym_array, position_array = get_position_arrays(sentence_combination)
    total_combos_possible = 0
    for some_value in total_synonym_array:
        if some_value > 1:
            total_combos_possible = total_combos_possible + some_value            
    total_combos_possible = total_combos_possible - 1
    last_position = -1
    
    # Generating alternative sentences    
    for i in range(num_versions):        
        if i >= total_combos_possible:
            break        
        position = get_next_position(total_synonym_array, position_array, last_position)
        if position == -1:
            break        
        alt_sentence = ''
        counter = 0
        for j in sentence_combination:
            alt_sentence = alt_sentence + ' '
            if counter == position:
                alt_sentence = alt_sentence + j[position_array[position]]
                position_array[position] = position_array[position] + 1                
                last_position = position
            else:
                if position_array[counter] > -1:
                    alt_sentence = alt_sentence + j[position_array[counter] - 1]
                else:
                    alt_sentence = alt_sentence + j[position_array[counter]]
            counter = counter + 1        
        alt_sentence = alt_sentence.strip()
        alternate_sentences.append(alt_sentence)
    return alternate_sentences

## Test

In [11]:
provide_alternate_sentence('We collect your information regularly', num_versions=4, similarity_threshold=0.2)

['We collect your data regularly',
 'We collect your information regularly',
 'We accumulate your information regularly',
 'We accumulate your info regularly']

### Data

In [12]:
import pandas as pd
import re

In [13]:
ROOT = '/home/mluser/users'
data_subjects = os.path.join(ROOT, 'data', 'subject_line')
print(data_subjects)

/home/mluser/users/data/subject_line


In [14]:
result = []
for (r, d, f) in os.walk(data_subjects):
    df = pd.read_parquet(os.path.join(r, f[15]))
    print(len(f))
#     for i in f:
#         result.append(pd.read_parquet(os.path.join(r, i)))

30


In [15]:
data = list(set(df['subject'].dropna()))

In [16]:
len(data)

135047

In [17]:
final = []
for d in data:
    k = [l for l in d if l=="{" or l=="}" or l=="[" or l=="]" or l=="(" or l==")"]
    if len(k)<4:
        if "Lewisville" not in d:
            final.append(d.strip())
data = final

In [18]:
MAX_LENGTH = 20

def filterSubject(p):
    return len(p.split(' ')) < MAX_LENGTH

def filterSubjects(subs):
    return [sub for sub in subs if filterSubject(sub)]

In [19]:
print(len(data))
data = filterSubjects(data)

93660


In [20]:
def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [21]:
data = [normalizeString(d) for d in data]

In [22]:
len(data)

87843

In [23]:
def visualize(d, n=4, s=0.5):
    final = [provide_alternate_sentence(i, num_versions=n, similarity_threshold=s) for i in d]
    for i, (d, d1) in enumerate(zip(d, final)):
        print('Subject', i,': ', d)
        for k in d1:
            print(k)
        print()
    return final

In [24]:
out = visualize(data[:5])

Subject 0 :  hurry you re still credit qualified to refinance !
hurry you re still credit qualify to refinance !
hurry you re however credit qualify to refinance !
hurry you re nevertheless credit qualify to refinance !
hurry you re yet credit qualify to refinance !

Subject 1 :  new york and company off cash back
new york and company off cash back

Subject 2 :  unitedhealth group health wellness benefits summary plan description
unitedhealth group health wellness benefit summary plan description
unitedhealth group health wellness benefit summary program description

Subject 3 :  recordatorio utiliza zelle para enviar y recibir dinero en un momento

Subject 4 :   last chance ! weeks of sunday home delivery just 
last happen ! week of sunday house delivery just
last chance ! week of sunday house delivery just
last chance ! week of sunday home delivery just
end chance ! week of sunday home delivery just

