### Document Similarity using Word Occurrence

In [2]:
import pandas as pd

In [1]:
# Toy reference corpus: eight short English sentences that the following
# cells vectorize and compare against a new sentence.
corpus = [
    "Its going to rain today",
    "I shall not be going outside",
    "We thought that would be a great idea",
    "Let me think about it about",
    "I am gonna play ukulele",
    "this is a good document ",  # note: the trailing space is part of this string
    "Documentation is very key in software dev",
    "its great to be a part of this team"
]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer # Bow representation

In [9]:
# Bag-of-words model: learn the vocabulary (minus scikit-learn's built-in
# English stop words) and count term occurrences per sentence in one call.
# fit_transform is equivalent to fit() followed by transform() on the same
# data, so `vec` ends up fitted and `vectorized_data` holds the count matrix.
vec = CountVectorizer(stop_words="english")
vectorized_data = vec.fit_transform(corpus)

In [13]:
# Show the document-term counts as a labelled table: one row per corpus
# sentence, one column per vocabulary term.
pd.DataFrame(
    data=vectorized_data.toarray(),
    columns=vec.get_feature_names_out(),
)

Unnamed: 0,dev,document,documentation,going,gonna,good,great,idea,key,let,outside,play,rain,shall,software,team,think,thought,today,ukulele
0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
5,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [14]:
# Encode an unseen sentence with the vocabulary already fitted on `corpus`;
# .toarray() turns the result into a dense array for inspection.
new_sentence = "I like to be in this team"
new_sentence_array = vec.transform([new_sentence]).toarray()

In [15]:
# Display the encoded sentence: one count per vocabulary term, in the
# vectorizer's feature order.
new_sentence_array

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]],
      dtype=int64)

In [16]:
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances, cosine_similarity

In [17]:
# Manhattan (L1) distance from the new sentence to every corpus document;
# a lower value means the count vectors are closer.
manhattan_distances(new_sentence_array, vectorized_data)

array([[4., 4., 4., 3., 4., 3., 5., 1.]])

In [18]:
# Cosine similarity of the new sentence against every corpus document;
# a higher value means more similar, and 0 means no shared vocabulary terms.
cosine_similarity(new_sentence_array, vectorized_data)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678]])

### Document similarity using Word2Vec

In [12]:
import pandas as pd
import gensim.downloader
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# List the names of the pretrained models exposed by gensim's downloader.
gensim.downloader.info()["models"].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [6]:
# Load the pretrained "word2vec-google-news-300" vectors via gensim's downloader.
# NOTE(review): presumably this downloads the model on first use, which can be
# slow and large — confirm before running on a fresh machine.
word2vec_300 = gensim.downloader.load("word2vec-google-news-300")

In [9]:
# Reference corpus for the Word2Vec section: eight short English sentences
# that are embedded below and compared against a query sentence.
corpus = [
    "Its going to rain today",
    "I shall not be going outside",
    "We thought that would be a great idea",
    "Let me think about it about",
    "I am gonna play ukulele",
    "this is a good document ",  # note: the trailing space is part of this string
    "Documentation is very key in software dev",
    "its great to be a part of this team"
]

In [10]:
# Embed each sentence as the mean word vector of its whitespace-separated tokens.
mean_vecs = list(map(word2vec_300.get_mean_vector, map(str.split, corpus)))

In [13]:
# Stack the per-sentence vectors into a single NumPy array (one row per sentence).
mean_vecs = np.array(mean_vecs)

In [14]:
# Sanity check: expect one row per corpus sentence and one column per embedding dimension.
mean_vecs.shape

(8, 300)

In [15]:
# Keep the raw query text under its own name so `new_sentence` is not bound
# first to a string and then to an array within the same cell.
new_sentence_text = "I like guitar"
# Mean word vector of the query's whitespace-separated tokens, reshaped to a
# single row because cosine_similarity expects 2-D inputs.
new_sentence = word2vec_300.get_mean_vector(new_sentence_text.split()).reshape(1, -1)

In [17]:
# Confirm the query embedding is a single-row 2-D array.
new_sentence.shape

(1, 300)

In [19]:
# Score the query against every sentence embedding, then look up the corpus
# sentence with the highest cosine similarity. The score matrix has a single
# row, so the flat argmax is the index of the best-matching sentence.
similiarity = cosine_similarity(new_sentence, mean_vecs)
corpus[similiarity.argmax()]

'I am gonna play ukulele'

In [20]:
# Pairwise cosine similarity between all corpus sentence embeddings: a
# symmetric matrix whose diagonal is ~1 (each sentence compared with itself).
cosine_similarity(mean_vecs)

array([[1.        , 0.5458617 , 0.4976383 , 0.5402243 , 0.4667031 ,
        0.4867311 , 0.41249555, 0.5250083 ],
       [0.5458617 , 0.9999999 , 0.77188164, 0.67045003, 0.6596979 ,
        0.59206176, 0.46699268, 0.59511185],
       [0.4976383 , 0.77188164, 1.0000001 , 0.7127102 , 0.5650948 ,
        0.64098555, 0.4568892 , 0.6812702 ],
       [0.5402243 , 0.67045003, 0.7127102 , 1.0000001 , 0.6308608 ,
        0.5727899 , 0.41704124, 0.5488018 ],
       [0.4667031 , 0.6596979 , 0.5650948 , 0.6308608 , 1.0000001 ,
        0.42198402, 0.33784413, 0.45884138],
       [0.4867311 , 0.59206176, 0.64098555, 0.5727899 , 0.42198402,
        0.9999999 , 0.6465095 , 0.7397194 ],
       [0.41249555, 0.46699268, 0.4568892 , 0.41704124, 0.33784413,
        0.6465095 , 0.99999994, 0.5737884 ],
       [0.5250083 , 0.59511185, 0.6812702 , 0.5488018 , 0.45884138,
        0.7397194 , 0.5737884 , 1.0000001 ]], dtype=float32)