## Agenda
1. Create the vocabulary of all unique terms (each of them will be a dimension)
2. Represent each document and the query in the vector space created by these terms
3. Calculate the cosine similarity between the query and each document
4. Rank the results based on the cosine similarity


In [1]:
#pip install numpy

In [2]:
#pip install --upgrade nbformat

In [3]:
#pip install jupyter

In [4]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
import plotly.express as px

### Generate Pandas Data Frame for the Sentence Vectors

In [5]:
voc = ['a', 'and', 'dress', 'earrings', 
       'has', 'i', 'in', 'is', 'lipstick', 
       'my', 'new', 'photo', 'red', 'resembles', 
       'she', 'short', 'stain', 'the', 'tomorrow', 
       'wear', 'wearing', 'will', 'wine', 'wore']
d1 = 'she wore a dress and red earrings'
d1 = d1.split(" ")
d2 = 'the dress has a red wine stain'
d2 = d2.split(" ")
d3 = 'tomorrow I will wear my new red dress'
d3 = d3.split(" ")
d4 = 'the red dress in the photo resembles the red dress she is wearing'
d4 = d4.split(" ")
d5 = 'short dress'
d5 = d5.split(" ")
d6 = 'my red lipstick'
d6 = d6.split(" ")
query = 'red dress'
q = query.split(" ")

In [6]:
def count(d):
    """
    Count the number of times a word appear in the sentence
    """
    dic = {}
    for word in d:
        if word in dic.keys():
            dic[word] += 1
        else:
            dic[word] = 1
    return dic

In [7]:
def assign01 (d, voc):
    """
    Generate vectors from the sentences
    """
    ls = []# empty list for vector representation
    num = count(d) #number of times a word appear in d
    for v in voc:
        if v in d:
            ls.append(num[v])
        else:
            ls.append(0)
    return ls
print(assign01(d1, voc))

[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [8]:
df = pd.DataFrame(index =['d1','d2','d3','d4','d5','d6'], columns = voc)
#df['sentence'] = ['d1','d2','d3','d4','d5','d6','d7']
df.loc['d1'] = assign01(d1, voc)
df.loc['d2'] = assign01(d2, voc)
df.loc['d3'] = assign01(d3, voc)
df.loc['d4'] = assign01(d4, voc)
df.loc['d5'] = assign01(d5, voc)
df.loc['d6'] = assign01(d6, voc)
q = assign01(q, voc)

In [9]:
# change pandas into numpy array
df.loc['d1'].to_numpy()

array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1], dtype=object)

In [10]:
def cosineSimilarity(vec1, vec2):
    """
    Calculate the cosine similarity between the two vectors
    """
    v1 = np.array(vec1)
    v2 = np.array(vec2)
    cosine = np.dot(v1,v2)/(norm(v1)*norm(v2))
    return cosine

In [11]:
# test
cosineSimilarity(df.loc['d1'].to_numpy(),df.loc['d2'].to_numpy())

0.4285714285714285

### Generate Heat Map

In [12]:
data = ['d1','d2','d3','d4','d5','d6']
heatmap_data = {}
for d1 in data:
    ls_val = []
    for d2 in data:
        ls_val.append(cosineSimilarity(df.loc[d1].to_numpy(),df.loc[d2].to_numpy()))
    heatmap_data[d1] = ls_val
heatmap_data

{'d1': [0.9999999999999999,
  0.4285714285714285,
  0.2857142857142857,
  0.39405520311955033,
  0.26726124191242434,
  0.2182178902359924],
 'd2': [0.4285714285714285,
  0.9999999999999999,
  0.2857142857142857,
  0.5516772843673704,
  0.26726124191242434,
  0.2182178902359924],
 'd3': [0.2857142857142857,
  0.2857142857142857,
  0.9999999999999999,
  0.31524416249564025,
  0.26726124191242434,
  0.4364357804719848],
 'd4': [0.39405520311955033,
  0.5516772843673704,
  0.31524416249564025,
  1.0000000000000002,
  0.29488391230979427,
  0.24077170617153845],
 'd5': [0.26726124191242434,
  0.26726124191242434,
  0.26726124191242434,
  0.29488391230979427,
  0.9999999999999998,
  0.0],
 'd6': [0.2182178902359924,
  0.2182178902359924,
  0.4364357804719848,
  0.24077170617153845,
  0.0,
  1.0000000000000002]}

In [13]:
#check dimensions
dff = pd.DataFrame(heatmap_data)
dff.shape

(6, 6)

In [14]:
sentences = ['she wore a dress and red earrings'[:25],
             'the dress has a red wine stain'[:25],
             'tomorrow I will wear my new red dress'[:25],
             'the red dress in the photo resembles the red dress she is wearing'[:25],
             'short dress'[:25],
             'my red lipstick'[:25]]
fig = px.imshow([ii for ii in heatmap_data.values()], 
                 color_continuous_scale=px.colors.sequential.YlOrRd, # the color palette name
                x=sentences,
                y=sentences,)
#fig.update_xaxes(side="top")

fig.show()

