# Cosine Similarity (Version C)

In [1]:
# Configuration

# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/
source_texts_directory = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/
embeddings_directory  = "/home/eml4u/EML4U/data/wikipedia-embeddings/"

# points of time
id_a = "20100408"
id_b = "20201101"
# category ids
id_american = "american-films"
id_british  = "british-films"
id_indian   = "indian-films"
# file ids
id_american_a = id_a + "-" + id_american
id_american_b = id_b + "-" + id_american
id_british_a  = id_a + "-" + id_british
id_british_b  = id_b + "-" + id_british
id_indian_a   = id_a + "-" + id_indian
id_indian_b   = id_b + "-" + id_indian


# Imports

import numpy
print("numpy:   " + numpy.version.version)

import sklearn
import sklearn.metrics
print("sklearn: " + sklearn.__version__)

# Class instance to access data (wp texts, pre-computed embeddings)
import data_access
data_accessor = data_access.DataAccess(source_texts_directory, embeddings_directory)

numpy:   1.19.2
sklearn: 0.23.2


# 1. Average embeddings

Compute average embeddings for 2 points in time. The results will be a 768-dimensional vector for each point in time.  
→ Get texts compared to the average vectors.

→ Get typical texts    
* One vector of old point in time $\bar{v_{t1}}$, one vector new point in time $\bar{v_{t2}}$  
* Between: CosSim  

In [2]:
# Load embeddings

embeddings_british_a = data_accessor.load_embeddings(id_british_a)
embeddings_british_b = data_accessor.load_embeddings(id_british_b)
print()


# Compute means / average embeddings

def get_mean(embeddings, note = "", printinfo = True):
    mean = numpy.mean(embeddings, axis=0)
    if printinfo:
        print(str(type(mean)) + " " + str(mean.shape) + " " +  note)
    return mean

print("Average embeddings for 2 points in time:")
mean_british_a  = get_mean(embeddings_british_a, "BritishA")
mean_british_b  = get_mean(embeddings_british_b, "BritishB")

/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt
(2147, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt
(2147, 768) <class 'numpy.ndarray'>

Average embeddings for 2 points in time:
<class 'numpy.ndarray'> (768,) BritishA
<class 'numpy.ndarray'> (768,) BritishB


In [3]:
# Cosine similarity

def get_pairwise_cosine_similarity(a, b, note = "", printinfo = True):
    if printinfo:
        print(str(type(a)) + " " + str(a.shape) + "\n" + str(type(b)) + " " + str(b.shape))
    cosSim = sklearn.metrics.pairwise.cosine_similarity(a, b, dense_output=True)[0][0]
    if printinfo:
        print(str(cosSim) + " " + note)
    return cosSim

print("Cosine similarity of average embeddings for 2 points in time:")
british_mean_cosine = get_pairwise_cosine_similarity(mean_british_a.reshape(1, -1), mean_british_b.reshape(1, -1), "British")

Cosine similarity of average embeddings for 2 points in time:
<class 'numpy.ndarray'> (1, 768)
<class 'numpy.ndarray'> (1, 768)
0.9936639191853995 British


In [4]:
# Texts compared to the average vectors

print("Typical texts:")
similarities_british_a = []
for i in range(len(mean_british_a)):
    similarities_british_a.append((i, get_pairwise_cosine_similarity(  mean_british_a.reshape(1, -1), embeddings_british_a[i].reshape(1, -1), "", False  )))
similarities_british_a = sorted(similarities_british_a, key=lambda tup: tup[1], reverse=False)
print(len(similarities_british_a), similarities_british_a[0:5])

similarities_british_b = []
for i in range(len(mean_british_b)):
    similarities_british_b.append((i, get_pairwise_cosine_similarity(  mean_british_b.reshape(1, -1), embeddings_british_b[i].reshape(1, -1), "", False  )))
similarities_british_b = sorted(similarities_british_b, key=lambda tup: tup[1], reverse=False)
print(len(similarities_british_b), similarities_british_b[0:5])

Typical texts:
768 [(721, 0.8095643688026439), (333, 0.8195543132287988), (680, 0.820176887350935), (663, 0.8234479285109668), (406, 0.834516265316151)]
768 [(680, 0.7901596223863818), (333, 0.8048592466102065), (179, 0.806587230956004), (381, 0.8126745126594594), (245, 0.8173266825145751)]


In [5]:
# Print source texts
def print_source_text(directory, category_id, index):
    print()
    print("Category: " + category_id)
    print("Index:    " + str(index))
    file = data_accessor.get_embeddings_dict_filename(category_id, index);
    print("File:     ")
    print(data_accessor.read_source_text(directory, file))
    print()

if False:
    print_source_text(id_british_b, id_british, similarities_british_b[0][0])
    print_source_text(id_british_b, id_british, 680)

### 2. Compare each document embedding $v_{t2i}$ (of every wp article) at $t2$ with $\bar{v_{t2}}$ using CosSim.  

* Get WP articles with largest distance to mean-vector $\bar{v_t2}$.
* Optional: For article with largest distance, check attention and highlight words with largest attention  
e.g. Integrated Gradients for text https://github.com/SeldonIO/alibi
* Check plotting + word counts (end of file) https://github.com/EML4U/Topic-Modeling/blob/main/Twitter%20test.ipynb 

In [6]:
# Get articles with largest distance to v_t2
# Distance: Smallest cosine similarity
# -> See similarities_british_b

In [7]:
# 100 articles with largest distance to mean vector B
distant_british_b = similarities_british_b[0:100]
print(distant_british_b)

[(680, 0.7901596223863818), (333, 0.8048592466102065), (179, 0.806587230956004), (381, 0.8126745126594594), (245, 0.8173266825145751), (255, 0.8246783158761353), (406, 0.8461331874787608), (663, 0.8509946040235267), (213, 0.8514522727454383), (483, 0.862620813656519), (526, 0.8629482931038279), (101, 0.8641558058376334), (720, 0.8643496013779346), (332, 0.8671766380114813), (605, 0.8679861756571845), (574, 0.8692133375218205), (385, 0.8703120437107627), (400, 0.8710870542779425), (272, 0.8747334865889536), (591, 0.8748916853566279), (295, 0.8764491348254336), (674, 0.880598769307674), (747, 0.8808114412749424), (174, 0.8836400509311044), (259, 0.8854263498204921), (355, 0.8855939571431497), (304, 0.8872719351437507), (764, 0.8881581364107067), (216, 0.8881797777193562), (23, 0.8894833788108552), (290, 0.890238506109883), (183, 0.8910271068032816), (522, 0.8910528699982174), (240, 0.8925691265135722), (727, 0.8929515843282431), (164, 0.8945786418679671), (17, 0.8953558750795475), (545, 