In [2]:
#code from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

In [3]:
levenshtein("I am Ge Gao, a cs student", "Ge Gao is a cs major student")

14

In [30]:
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag   ##implementation of similarity() from http://nlpforhackers.io/wordnet-sentence-similarity/
def penn_to_wn(tag):
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None
 
def tagged_to_synset(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
 
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None
def pathsim(ss1, ss2):
    if ss1.path_similarity(ss2) is None:
        return 0
    return ss1.path_similarity(ss2)
def sentence_similarity(sentence1, sentence2):
    """ compute the sentence similarity using Wordnet """
    # Tokenize and tag
    NoneType = type(None)
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
 
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
 
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if not type(ss) == NoneType]
    synsets2 = [ss for ss in synsets2 if not type(ss) == NoneType]
 
    score, count = 0.0, 0
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        best_score = max([pathsim(synset, ss) for ss in synsets2])
 
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1
 
    # Average the values
    score /= count
    return score
sentences = [
    "Dogs are awesome.",
    "Some gorgeous creatures are felines.",
    "Dolphins are swimming mammals.",
    "Cats are beautiful animals.",
]
 
focus_sentence = "Hi, Siri."
 
for sentence in sentences:
    print("Similarity(\"%s\", \"%s\") = %s",focus_sentence, sentence, sentence_similarity(focus_sentence, sentence))
    print("Similarity(\"%s\", \"%s\") = %s",sentence, focus_sentence, sentence_similarity(sentence, focus_sentence))
    print 
 

Similarity("%s", "%s") = %s Hi, Siri. Dogs are awesome. 0.06666666666666667
Similarity("%s", "%s") = %s Dogs are awesome. Hi, Siri. 0.059259259259259255
Similarity("%s", "%s") = %s Hi, Siri. Some gorgeous creatures are felines. 0.07692307692307693
Similarity("%s", "%s") = %s Some gorgeous creatures are felines. Hi, Siri. 0.06016644174538911
Similarity("%s", "%s") = %s Hi, Siri. Dolphins are swimming mammals. 0.0625
Similarity("%s", "%s") = %s Dolphins are swimming mammals. Hi, Siri. 0.07976641414141414
Similarity("%s", "%s") = %s Hi, Siri. Cats are beautiful animals. 0.07692307692307693
Similarity("%s", "%s") = %s Cats are beautiful animals. Hi, Siri. 0.059508547008547005


In [31]:
print(sentence_similarity("dogs are awesome", "cats are beautiful animals"))

0.4444444444444444


In [125]:
import gensim
from gensim import corpora, models, similarities
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
from scipy import spatial
import numpy
def avg_feature_vector(words, model, num_features): ##implementation from https://stackoverflow.com/questions/22129943/how-to-calculate-the-sentence-similarity-using-word2vec-model-of-gensim-with-pyt
        #function to average all words vectors in a given paragraph
        featureVec = numpy.zeros((num_features,), dtype="float32")
        nwords = 0

        #list containing names of words in the vocabulary
        #index2word_set = set(model.index2word) this is moved as input param for performance reasons
        for word in words:
            nwords = nwords+1
            print(featureVec)
            featureVec = numpy.add(featureVec, model.wv[word])
        print(nwords)
        if(nwords>0):
            featureVec = numpy.divide(featureVec, nwords)
        
        return featureVec


In [126]:
sentence_1 = "king"
sentence_1_avg_vector = avg_feature_vector(sentence_1.split(),model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True), num_features=300)

#get average vector for sentence 2
sentence_2 = "queen"
sentence_2_avg_vector = avg_feature_vector(sentence_2.split(), model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True), num_features=300)
sens = [sentence_1, sentence_2]

if not sentence_1_avg_vector.all() * sentence_2_avg_vector.all() == 0:
    print(spatial.distance.cosine(sentence_1_avg_vector,sentence_2_avg_vector))
    sen1_sen2_similarity =  1 - spatial.distance.cosine(sentence_1_avg_vector,sentence_2_avg_vector)
else:
    sen1_sen2_similarity = 1

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

In [128]:
print("similarity: ")
print(sen1_sen2_similarity)

similarity: 
0.651095688343


In [None]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
print(model.similarity('word', 'sound'))