# Cosine Similarity - Approach B

- Loads embeddings (pre-computed from texts)
- Computes **mean of embedding**-arrays
- Computes **cosine-similarity of embedding-means** of two points of time
- ...

In [1]:
import numpy
print("numpy:   " + numpy.version.version)

numpy:   1.19.2


In [2]:
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/
source_texts_directory = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/
embeddings_directory  = "/home/eml4u/EML4U/data/wikipedia-embeddings/"

# points of time
id_a = "20100408"
id_b = "20201101"
# category ids
id_american = "american-films"
id_british  = "british-films"
id_indian   = "indian-films"
# file ids
id_american_a = id_a + "-" + id_american
id_american_b = id_b + "-" + id_american
id_british_a  = id_a + "-" + id_british
id_british_b  = id_b + "-" + id_british
id_indian_a   = id_a + "-" + id_indian
id_indian_b   = id_b + "-" + id_indian

In [3]:
# Class instance to access data (wp texts, pre-computed embeddings)
import data_access
data_accessor = data_access.DataAccess(source_texts_directory, embeddings_directory)

In [4]:
# Compute means
def getMean(embeddings, note = "", printinfo = True):
    mean = numpy.mean(embeddings, axis=0)
    if printinfo:
        print(str(type(mean)) + " " + str(mean.shape) + " " +  note)
    return mean

In [5]:
# Differences of arrays as one value
def differenceValue(a, b):
    x = 0
    for i in range(len(a)):
        x += abs(a[i] - b[i])
    return x;

# Test
if False:
    print(differenceValue(numpy.array([1,2,3,4]), numpy.array([1.1,2.2,3.3,4.4])))

In [6]:
# Load embeddings
embeddingsBritishA = data_accessor.load_embeddings(id_british_a)
embeddingsBritishB = data_accessor.load_embeddings(id_british_b)

# Compute means
meanBritishA  = getMean(embeddingsBritishA, "BritishA")
meanBritishB  = getMean(embeddingsBritishB, "BritishB")

/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt
(2147, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt
(2147, 768) <class 'numpy.ndarray'>
<class 'numpy.ndarray'> (768,) BritishA
<class 'numpy.ndarray'> (768,) BritishB


In [7]:
# Compute differences
# Array (index, difference to mean t1, difference to mean t2)
differences = []
for i in range(len(meanBritishA)):
    differences.append((i, differenceValue(meanBritishA, embeddingsBritishA[i]), differenceValue(meanBritishB, embeddingsBritishB[i])))

# Sort by differences
# Arrays (index, difference to mean t1, difference to mean t2)
# Sorted by similarity to A / t1
differencesA = sorted(differences, key=lambda tup: tup[1])
# Sorted by similarity to B / t2
differencesB = sorted(differences, key=lambda tup: tup[2])

In [8]:
# Print largest differences
if True:
    print(differencesA[len(differencesA) - 1])
    print(differencesA[len(differencesA) - 2])
    print()
    print(differencesB[len(differencesB) - 1])
    print(differencesB[len(differencesB) - 2])

(721, 127.63656949018497, 63.07215986661895)
(333, 126.23330120916125, 126.63447396310393)

(179, 114.6978323360855, 131.07256570494872)
(680, 122.11133584967376, 129.0560627009414)
