In [1]:
import numpy as np
import pandas as pd

In [2]:
sentences = [
    'Molly ate a donut',
    'Molly ate a fish',
    'Jen consumed a carp',
    'Lenny fears the lions'
]

print('\n'.join(sentences))

Molly ate a donut
Molly ate a fish
Jen consumed a carp
Lenny fears the lions


In [3]:
from sklearn.feature_extraction.text import CountVectorizer 

vectorizer = CountVectorizer(binary=True)
matrix = vectorizer.fit_transform(sentences)
counts = pd.DataFrame(
    matrix.toarray(),
    index=sentences,
    columns=vectorizer.get_feature_names_out())
counts

Unnamed: 0,ate,carp,consumed,donut,fears,fish,jen,lenny,lions,molly,the
Molly ate a donut,1,0,0,1,0,0,0,0,0,1,0
Molly ate a fish,1,0,0,0,0,1,0,0,0,1,0
Jen consumed a carp,0,1,1,0,0,0,1,0,0,0,0
Lenny fears the lions,0,0,0,0,1,0,0,1,1,0,1


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the similarities using the word counts
similarities = cosine_similarity(matrix)

# Make a fancy colored dataframe about it
pd.DataFrame(similarities,
             index=sentences,
             columns=sentences) \
            .style \
            .background_gradient(axis=None)

Unnamed: 0,Molly ate a donut,Molly ate a fish,Jen consumed a carp,Lenny fears the lions
Molly ate a donut,1.0,0.666667,0.0,0.0
Molly ate a fish,0.666667,1.0,0.0,0.0
Jen consumed a carp,0.0,0.0,1.0,0.0
Lenny fears the lions,0.0,0.0,0.0,1.0


In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [10]:
nlp('cat').vector

array([ 2.0860276 ,  0.7803842 ,  0.20159206, -1.28282   , -1.5474837 ,
        0.47157538,  0.3883854 ,  0.7717205 ,  1.1035886 , -0.33745673,
       -0.86791813, -0.7204295 ,  0.6183014 , -0.32314768, -0.82261956,
       -0.20434552, -0.6263491 ,  0.93535054,  0.7537019 , -0.40026405,
        0.09017381, -0.38332236, -0.47405103,  1.5803869 ,  1.1813339 ,
        0.79796267,  0.16633701,  0.609073  ,  0.7507337 ,  0.68391263,
        0.04651658,  0.33379614, -1.1364949 , -0.72270274, -0.28555137,
        1.1861582 , -0.5686623 ,  0.36767524, -0.30950528,  0.6826811 ,
       -0.00347298,  1.1778853 , -0.27158564, -0.9354117 , -0.41762415,
       -0.7822131 ,  0.38916755,  1.4292797 , -0.11505154,  0.48539934,
       -0.46302786, -1.720025  , -0.7341925 , -1.6413456 ,  0.03816861,
        0.1438995 , -1.593693  , -0.81107455, -0.31180072, -0.09018594,
       -0.97376084, -0.67782843,  0.04011485, -0.6208818 ,  0.70654297,
        0.8080195 , -0.17157346,  0.12769684,  0.42583132, -1.30

In [13]:
nlp('Some people have never eaten a taco').vector

numpy.ndarray

In [14]:
# We aren't printing this because it's 3 * 300 = 900 numbers
vectors = [nlp(sentence).vector for sentence in sentences]

# Print out some notes about it
print("We have", len(vectors), "different vectors")
print("And the first one has", len(vectors[0]), "measurements")
print("And the second one has", len(vectors[1]), "measurements")
print("And the third one has", len(vectors[2]), "measurements")
print("And the fourth one has", len(vectors[3]), "measurements")

We have 4 different vectors
And the first one has 96 measurements
And the second one has 96 measurements
And the third one has 96 measurements
And the fourth one has 96 measurements


In [15]:
# Compute similarities
similarities = cosine_similarity(vectors)

# Turn into a dataframe
pd.DataFrame(similarities,
            index=sentences,
            columns=sentences) \
            .style \
            .background_gradient(axis=None)

Unnamed: 0,Molly ate a donut,Molly ate a fish,Jen consumed a carp,Lenny fears the lions
Molly ate a donut,1.0,0.812,0.620459,0.158913
Molly ate a fish,0.812,1.0,0.652032,0.332671
Jen consumed a carp,0.620459,0.652032,1.0,0.55861
Lenny fears the lions,0.158913,0.332671,0.55861,1.0


In [17]:
# Here are our sentences
sentences = [
    'Veronica hates mustard. I hate you.',
    'Veronica loves ketchup',
    'Joseph hates ketchup',
]

# Turn into vectors
vectors = [nlp(sentence).vector for sentence in sentences]

# Compute similarities
similarities = cosine_similarity(vectors)

# Turn into a dataframe
pd.DataFrame(similarities,
            index=sentences,
            columns=sentences) \
            .style \
            .background_gradient(axis=None)

Unnamed: 0,Veronica hates mustard. I hate you.,Veronica loves ketchup,Joseph hates ketchup
Veronica hates mustard. I hate you.,1.0,0.54982,0.506746
Veronica loves ketchup,0.54982,1.0,0.815793
Joseph hates ketchup,0.506746,0.815793,1.0


## Testing with Gensim

In [1]:
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases #, ENGLISH_CONNECTOR_WORDS
from gensim.models import Word2Vec

import pandas as pd

In [2]:
Letter = '''My dear Madam,—I now take my pen to write a few lines, as I have been
thinking of doing so a long time. I often think of you, and I daresay you
would like to hear about the school out here. I have sixty girls that attend
nice and regularly. I used to have three girls to assist me, but one of them
has left for Queenstown, where her father has undertaken the native
school. My first class are now able to read in the third reading books, and
in compound long division sums. They already know a little geography of
this world, and they are more fond of it than I used to be. I teach them
the map of Palestine, to make them understand what they are learning. I
tell them it all in Kafir [Xhosa], and it makes it quite interesting to them.
The Kafir has all come back to me quite natural. The second class read
Letterto Miss Mackenzie 113
Peep of Day, and are in addition and subtraction sums. We have a nice
large school-room.'''


In [3]:
Letter_list = [x.replace("\n"," ").replace(".", "") for x in Letter.split(". ")]
Letter_list = [simple_preprocess(x, deacc=True, max_len=50, min_len=1) for x in Letter_list]
print(Letter_list)

[['my', 'dear', 'madam', 'i', 'now', 'take', 'my', 'pen', 'to', 'write', 'a', 'few', 'lines', 'as', 'i', 'have', 'been', 'thinking', 'of', 'doing', 'so', 'a', 'long', 'time'], ['i', 'often', 'think', 'of', 'you', 'and', 'i', 'daresay', 'you', 'would', 'like', 'to', 'hear', 'about', 'the', 'school', 'out', 'here'], ['i', 'have', 'sixty', 'girls', 'that', 'attend', 'nice', 'and', 'regularly'], ['i', 'used', 'to', 'have', 'three', 'girls', 'to', 'assist', 'me', 'but', 'one', 'of', 'them', 'has', 'left', 'for', 'queenstown', 'where', 'her', 'father', 'has', 'undertaken', 'the', 'native', 'school'], ['my', 'first', 'class', 'are', 'now', 'able', 'to', 'read', 'in', 'the', 'third', 'reading', 'books', 'and', 'in', 'compound', 'long', 'division', 'sums'], ['they', 'already', 'know', 'a', 'little', 'geography', 'of', 'this', 'world', 'and', 'they', 'are', 'more', 'fond', 'of', 'it', 'than', 'i', 'used', 'to', 'be'], ['i', 'teach', 'them', 'the', 'map', 'of', 'palestine', 'to', 'make', 'them', 

In [4]:
model = Word2Vec(sentences = Letter_list, min_count = 1, vector_size = 600, workers = 3, window = 10)

In [5]:
model.wv.key_to_index.keys()

dict_keys(['i', 'to', 'and', 'of', 'the', 'them', 'have', 'are', 'in', 'a', 'it', 'my', 'they', 'school', 'has', 'quite', 'long', 'girls', 'nice', 'now', 'read', 'class', 'used', 'you', 'me', 'all', 'kafir', 'sums', 'assist', 'three', 'take', 'but', 'madam', 'regularly', 'one', 'attend', 'that', 'dear', 'sixty', 'out', 'here', 'lines', 'thinking', 'as', 'doing', 'so', 'time', 'often', 'think', 'left', 'been', 'few', 'would', 'like', 'hear', 'about', 'write', 'pen', 'daresay', 'room', 'for', 'queenstown', 'palestine', 'make', 'understand', 'what', 'learning', 'tell', 'xhosa', 'makes', 'interesting', 'come', 'back', 'natural', 'second', 'letterto', 'miss', 'mackenzie', 'peep', 'day', 'addition', 'subtraction', 'we', 'map', 'teach', 'be', 'books', 'where', 'her', 'large', 'undertaken', 'native', 'first', 'able', 'third', 'reading', 'compound', 'than', 'division', 'already', 'know', 'little', 'geography', 'this', 'world', 'more', 'fond', 'father'])

In [7]:
model.wv["world"]

array([-1.72422806e-04, -1.35407224e-03,  1.16094342e-03,  1.27843872e-03,
       -9.38190613e-04, -1.15234521e-03, -1.50250585e-03, -1.14931187e-04,
        1.36371003e-03, -1.63611199e-03, -1.09613800e-04,  1.31192908e-03,
        7.82938092e-04, -6.58064018e-05, -5.14253392e-04,  1.00325793e-03,
       -3.97983997e-04,  2.44141964e-04, -8.36447929e-04,  7.42785924e-05,
       -3.73289280e-04, -1.60904729e-03,  9.42486513e-04, -1.74680186e-04,
       -1.31686195e-03, -3.30631621e-04,  6.14933262e-04, -3.66324530e-04,
        1.00784539e-03,  2.04706885e-04, -9.45079722e-04, -3.51231691e-04,
       -2.33610772e-04,  1.67025614e-03,  1.08399265e-03, -1.66515424e-03,
       -5.85437228e-05, -1.65538210e-03, -1.48262689e-03,  6.61775877e-04,
        9.25589295e-04,  1.49562466e-03, -1.21386314e-03, -9.83968377e-04,
        1.35951757e-03, -1.06621289e-03, -1.51796587e-04, -4.16996452e-04,
        1.54880306e-03,  8.35067593e-04, -2.29410623e-04, -1.46716393e-05,
        8.31816171e-04,  