In [None]:
import numpy as np
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [None]:
class Docs_similarity():
    """Compare documents by the cosine similarity of their averaged word embeddings.

    A document is lower-cased, tokenised with ``word_tokenize`` and represented
    by the mean of the embeddings of those tokens that are present in the
    supplied embedding mapping.  The helpers that do not need instance state
    are static, so they can be called on the class or on an instance.
    """

    def __init__(self):
        print('Instance created')

    @staticmethod
    def cosine(v1, v2):
        """Return the cosine similarity of two equal-length vectors.

        Args:
            v1: First vector (any array-like of numbers).
            v2: Second vector (any array-like of numbers).

        Returns:
            ``dot(v1, v2) / (|v1| * |v2|)``.  The result is NaN when either
            vector has zero norm, because the quotient is then 0 / 0.
        """
        v1, v2 = np.asarray(v1), np.asarray(v2)
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    @staticmethod
    def _is_missing_embedding(vector):
        """Return True if *vector* is the scalar placeholder for "no embedding"."""
        # A real averaged embedding has at least one dimension; the
        # placeholder returned for documents without known tokens is 0-d.
        return np.ndim(vector) == 0

    @staticmethod
    def document_to_avg_embedding(doc, embeddings):
        """Average the embeddings of the known tokens of a document.

        Args:
            doc: Document text (a ``str``).
            embeddings: Token -> vector lookup supporting ``embeddings[token]``.
                Either a mapping with ``keys()`` (e.g. a dict) or an object
                exposing ``key_to_index`` (as gensim's KeyedVectors does).

        Returns:
            The element-wise mean of the embeddings of all tokens found in
            ``embeddings``, or a scalar NaN when no token is found.
        """
        tokens = word_tokenize(str.lower(doc))

        # Resolve the vocabulary once per document instead of once per token.
        try:
            vocabulary = embeddings.keys()
        except AttributeError:
            vocabulary = embeddings.key_to_index

        doc_embeddings = [embeddings[token] for token in tokens if token in vocabulary]

        if not doc_embeddings:
            # Same value np.mean() yields for an empty input, but without
            # NumPy's "Mean of empty slice" RuntimeWarning.
            return np.float64(np.nan)

        return np.mean(np.array(doc_embeddings), axis=0)

    def cosine_sim_between_docs(self, doc1, doc2, embeddings, verbose=False):
        """Return the cosine similarity between two documents.

        Args:
            doc1: First document text.
            doc2: Second document text.
            embeddings: Token -> vector lookup, see
                ``document_to_avg_embedding``.
            verbose: When true, print the result or a warning for a document
                without any known token.

        Returns:
            The cosine similarity of the averaged embeddings, or ``0.0`` when
            no embedding could be found for one of the documents.
        """
        v1 = self.document_to_avg_embedding(doc1, embeddings)
        v2 = self.document_to_avg_embedding(doc2, embeddings)

        # If there are no embeddings for a document, the similarity is
        # defined as 0.0 instead of propagating NaN.
        for doc, vector in ((doc1, v1), (doc2, v2)):
            if self._is_missing_embedding(vector):
                if verbose:
                    print(f'Warning cannot find embedding for {doc}.')
                return 0.0

        cosine_sim = self.cosine(v1, v2)

        if verbose:
            print(f'Cosine similarity between {doc1} and {doc2} is:\n{cosine_sim}\n')

        return cosine_sim

    @staticmethod
    def calculate_similarity_matrix(cadec_sample, mapping, n_entity_types=5):
        """Return the mean pairwise document similarity per pair of entity types.

        Args:
            cadec_sample: Table providing ``iterrows()`` whose rows have a
                ``text`` attribute and an ``entity_type`` attribute.  The
                entity type is used directly as a matrix index, so it must be
                an integer in ``range(n_entity_types)``.
            mapping: Token -> vector lookup, see
                ``document_to_avg_embedding``.
            n_entity_types: Number of distinct entity types, i.e. the side
                length of the returned square matrix.

        Returns:
            An ``(n_entity_types, n_entity_types)`` array whose cell
            ``[a, b]`` is the mean similarity over all ordered pairs of
            different rows with entity types ``a`` and ``b``.  Cells without
            any such pair are NaN.
        """
        similarity_sum = np.zeros((n_entity_types, n_entity_types))
        pair_count = np.zeros((n_entity_types, n_entity_types))

        # Tokenise and embed every document once instead of once per pair.
        documents = [
            (index, row.entity_type,
             Docs_similarity.document_to_avg_embedding(row.text, mapping))
            for index, row in cadec_sample.iterrows()
        ]

        for index_i, type_i, vector_i in documents:
            for index_j, type_j, vector_j in documents:
                # Do not calculate similarities for the same entity, to not
                # overestimate the score within the same group.
                if index_i == index_j:
                    continue

                if (Docs_similarity._is_missing_embedding(vector_i)
                        or Docs_similarity._is_missing_embedding(vector_j)):
                    # Same convention as cosine_sim_between_docs.
                    similarity = 0.0
                else:
                    similarity = Docs_similarity.cosine(vector_i, vector_j)

                similarity_sum[type_i, type_j] += similarity
                pair_count[type_i, type_j] += 1

        # Average only where pairs exist; the remaining cells stay NaN
        # without triggering a division-by-zero warning.
        similarity_matrix = np.full_like(similarity_sum, np.nan)
        np.divide(similarity_sum, pair_count, out=similarity_matrix,
                  where=pair_count > 0)
        return similarity_matrix

    @staticmethod
    def plot_similarity_matrix(similarity_matrix):
        """Show a similarity matrix as a heat map annotated with its values.

        Args:
            similarity_matrix: 2-D array-like of similarities.  Colours are
                scaled to the range [-1, 1].
        """
        similarity_matrix = np.asarray(similarity_matrix)
        n_rows, n_cols = similarity_matrix.shape

        plt.imshow(similarity_matrix, vmin=-1, vmax=1,
                   extent=[0, n_cols, 0, n_rows])

        for (i, j), value in np.ndenumerate(similarity_matrix):
            # imshow draws row 0 at the top (origin='upper') while the y axis
            # of this extent grows upwards, so row i is centred at
            # y = n_rows - i - 0.5.
            plt.annotate(str(round(value, 3)),
                         xy=(j + 0.5, n_rows - i - 0.5),
                         ha='center', va='center', color='white')

        plt.show()

    