# Installations

In [3]:
from IPython.display import clear_output

In [4]:
# Install MiniSom, the self-organising-map package imported below as `MiniSom`.
!pip install minisom

# Wipe the pip log from the cell output.
clear_output()

In [5]:
# Install spaCy pinned to 3.0.5; the notebook later downloads the
# en_core_web_md-3.0.0 model archive, which targets the spaCy 3.0 line.
!pip install spacy==3.0.5

# Wipe the pip log from the cell output.
clear_output()

In [6]:
# Install Hugging Face transformers (provides BertTokenizer / BertModel used below).
# NOTE(review): the version is not pinned, so results may drift between runs.
!pip install transformers

# Wipe the pip log from the cell output.
clear_output()

In [7]:

import pickle
import requests
import os
import tarfile
import torch
import spacy
import nltk
import numpy as np

from minisom import MiniSom
from gensim.utils import tokenize
from transformers import BertTokenizer, BertModel, logging
from google.colab import drive
from google.colab import files

from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, output_file

from matplotlib import cm
from collections import Counter
from tqdm import tqdm

# Render bokeh plots inline in the notebook instead of in a separate page.
output_notebook()

# `logging` here is transformers' logging module (see the imports above):
# only report messages at error level.
logging.set_verbosity_error()

In [8]:
# Mount Google Drive; the data files and cached pickles are read from it below.
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Base folder for every data/cache file used in this notebook. It is a relative
# path, so it only resolves when the working directory is the parent of the
# mounted `drive` folder.
path = 'drive/MyDrive/'
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
# Pickled article collection for 2017 (indexed and sliced like a list further down).
with open(path+'2017', 'rb') as f_2017:
  data_2017 = pickle.load(f_2017)

# Pickled article collection for 2020.
with open(path+'2020', 'rb') as f_2020:
  data_2020 = pickle.load(f_2020)

# Pickled article collection for 2010.
with open(path+'2010', 'rb') as f_2010:
  data_2010 = pickle.load(f_2010)

# Plain-text Shakespeare source, kept as a single string.
with open(path+'shakespeare.txt', 'r') as f_shakespeare:
  shakespeare = f_shakespeare.read()

In [10]:
STOPWORDS_FILE = 'stopwords.txt'


def download_model():
    """Download and unpack the spaCy ``en_core_web_md`` 3.0.0 model archive.

    The archive is stored in the working directory as
    ``en_core_web_md_temporary`` and extracted next to it. If that file
    already exists nothing is downloaded or extracted.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is written to disk in that case, so a later call retries
            instead of mistaking an error page for a cached archive.
    """
    filename = 'en_core_web_md_temporary'
    if os.path.exists(filename):
        return

    r = requests.get('https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0'
                     '/en_core_web_md-3.0.0.tar.gz', allow_redirects=True)
    # Fail before touching the disk: the existence check above would otherwise
    # treat a saved error response as a valid download forever.
    r.raise_for_status()

    with open(filename, 'wb') as archive_file:
        archive_file.write(r.content)

    # Context manager closes the archive even if extraction fails.
    with tarfile.open(filename, 'r:gz') as tar:
        tar.extractall()


def download_stopwords():
    """Download the extended stop-word list to ``STOPWORDS_FILE``.

    Does nothing if the file already exists.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is written in that case, so an error page is never
            cached as the stop-word list.
    """
    filename = STOPWORDS_FILE
    if os.path.exists(filename):
        return

    r = requests.get('https://github.com/DinarZayahov/thesaurus/releases/download/0.0.1/extended_stopwords.txt', allow_redirects=True)
    r.raise_for_status()

    # Close the handle deterministically so the content is flushed before use.
    with open(filename, 'wb') as stopwords_file:
        stopwords_file.write(r.content)

In [11]:
# Fetch the spaCy model archive and the stop-word list (each is skipped when
# its target file already exists).
download_model()
download_stopwords()

In [12]:
# 'words' backs nltk.corpus.words, which Thesaurus.preprocess uses as its vocabulary.
nltk.download('words')
# NOTE(review): no code in this notebook references the 'wordnet' corpus directly.
nltk.download('wordnet')

# Wipe the download log from the cell output.
clear_output()

# BERT Functions

In [13]:
# Load the pretrained 'bert-base-uncased' checkpoint. output_hidden_states=True
# is required: get_bert_embeddings reads the per-layer hidden states.
model = BertModel.from_pretrained('bert-base-uncased',
           output_hidden_states = True)

# Put the model in evaluation mode; it is only used for inference here.
model.eval()

# Tokenizer belonging to the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_text_preparation(text, tokenizer):
    """Turn raw text into the inputs expected by the BERT model.

    The text is wrapped in ``[CLS]`` / ``[SEP]`` markers, tokenized, and
    converted to ids. A segment-id tensor of the same shape is built in
    which every position is 1.

    Args:
        text (str): Text to encode.
        tokenizer (obj): Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        list: Tokens produced by the tokenizer (markers included).
        obj: Torch tensor of token ids with a leading batch dimension of 1.
        obj: Torch tensor of segment ids (all ones), same shape.
    """
    wordpieces = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    piece_ids = tokenizer.convert_tokens_to_ids(wordpieces)

    # Wrap in an outer list so both tensors carry a batch dimension of 1.
    ids_batch = torch.tensor([piece_ids])
    segment_batch = torch.tensor([[1 for _ in piece_ids]])

    return wordpieces, ids_batch, segment_batch


def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Compute one embedding per token by summing the last four hidden states.

    Args:
        tokens_tensor (obj): Torch tensor of token ids with a batch
            dimension of 1.
        segments_tensors (obj): Torch tensor of segment ids, same shape.
        model (obj): Callable whose result exposes the sequence of hidden
            states at index 2 (e.g. a BertModel created with
            ``output_hidden_states=True``).

    Returns:
        list: One list of floats per token, each the element-wise sum of
            that token's vectors from the last four hidden states.
    """
    # Inference only: no gradients are tracked.
    with torch.no_grad():
        # NOTE(review): segments_tensors is passed as the SECOND POSITIONAL
        # argument. In current transformers releases that slot of
        # BertModel.forward is `attention_mask`, not `token_type_ids` --
        # confirm this is intended before changing it, since switching would
        # alter the resulting embeddings.
        layer_outputs = model(tokens_tensor, segments_tensors)[2]

    # Stack every returned hidden state (none is dropped) along a new leading
    # axis, drop the batch axis, then reorder so tokens come first:
    # [layers, 1, tokens, dim] -> [tokens, layers, dim].
    per_token_layers = torch.squeeze(torch.stack(layer_outputs, dim=0), dim=1).permute(1, 0, 2)

    # For each token, add up its vectors from the last four layers and
    # convert the result to a plain Python list.
    return [torch.sum(layers[-4:], dim=0).tolist() for layers in per_token_layers]


def merge_vectors(list_of_vectors):
  """Return the element-wise mean of a list of equal-length vectors."""
  stacked = np.asarray(list_of_vectors)
  return stacked.mean(axis=0)


def merge_subwords(list_of_subwords):
  """Glue word pieces back into one word.

  The first piece is kept as is; the first two characters (the "##"
  continuation marker) are cut from every following piece.
  """
  head, continuations = list_of_subwords[0], list_of_subwords[1:]
  return head + "".join(piece[2:] for piece in continuations)


def get_stopwords(path):
  """Read a stop-word file with one word per line.

  Args:
      path (str): Location of the text file.

  Returns:
      list: The lines of the file in order, without their line terminator.
  """
  # `with` closes the handle; the original left the file open.
  with open(path, 'r') as stopwords_file:
    # Strip only the newline. Blindly dropping the last character would
    # truncate the final word when the file has no trailing newline.
    return [line.rstrip('\n') for line in stopwords_file]


def f(sentences):
  """Tokenize and embed every sentence with the module-level BERT objects.

  Args:
      sentences: Iterable of strings.

  Returns:
      tuple: (tokens, embeddings) -- two flat, parallel lists holding the
          tokens and per-token embeddings of all sentences in input order.
  """
  flat_tokens, flat_embeddings = [], []

  for sentence in sentences:
      pieces, ids_tensor, segments_tensor = bert_text_preparation(sentence, tokenizer)
      piece_embeddings = get_bert_embeddings(ids_tensor, segments_tensor, model)

      flat_tokens.extend(pieces)
      flat_embeddings.extend(piece_embeddings)

  return flat_tokens, flat_embeddings


def merge(tokens, embeddings):
  """Collapse "##" continuation pieces into the word they belong to.

  A token followed by one or more tokens starting with "##" is replaced by
  a single token (built with ``merge_subwords``) whose embedding is the
  mean of the group's embeddings (``merge_vectors``). All other tokens and
  embeddings are passed through unchanged.

  Compared with the earlier backward-indexing version this is a single
  forward pass, and it no longer raises IndexError when the very first or
  the very last token is a "##" piece.

  Args:
      tokens (list): Tokens, parallel to ``embeddings``.
      embeddings (list): One embedding per token.

  Returns:
      tuple: (merged_tokens, merged_embeddings) as new parallel lists.
  """
  merged_tokens, merged_embeddings = [], []
  total = len(tokens)
  head = 0

  while head < total:
    # Extend the group over every continuation piece that follows `head`.
    stop = head + 1
    while stop < total and tokens[stop][:2] == "##":
      stop += 1

    if stop - head > 1:
      merged_tokens.append(merge_subwords(tokens[head:stop]))
      merged_embeddings.append(merge_vectors(embeddings[head:stop]))
    else:
      merged_tokens.append(tokens[head])
      merged_embeddings.append(embeddings[head])

    head = stop

  return merged_tokens, merged_embeddings


def filter(tokens, embeddings):
  """Drop special markers, non-alphabetic tokens and stop words.

  NOTE: this function shadows the built-in ``filter`` for the rest of the
  notebook; the name is kept because other cells call it.

  Args:
      tokens (list): Tokens, parallel to ``embeddings``.
      embeddings (list): One embedding per token.

  Returns:
      tuple: (filtered_tokens, filtered_embeddings) containing only the
          entries whose token is purely alphabetic, is not '[SEP]' or
          '[CLS]', and is not listed in the stop-word file.
  """
  # A set gives constant-time membership tests instead of scanning the whole
  # stop-word list for every token.
  stopwords = set(get_stopwords(STOPWORDS_FILE))
  special_markers = ('[SEP]', '[CLS]')

  filtered_tokens = []
  filtered_embeddings = []
  for position, token in enumerate(tokens):
    if token in special_markers or not token.isalpha() or token in stopwords:
      continue
    filtered_tokens.append(token)
    filtered_embeddings.append(embeddings[position])

  return filtered_tokens, filtered_embeddings

def bert_embedding(token):
  """Return the first 300 components of the BERT embedding of ``token``.

  The token goes through ``f`` -> ``merge`` -> ``filter``; the embedding of
  the first surviving token is truncated to 300 values. If nothing
  survives filtering, indexing the empty result raises IndexError.
  """
  pieces, vectors = f([token])
  pieces, vectors = merge(pieces, vectors)
  _, surviving_vectors = filter(pieces, vectors)
  first_vector = surviving_vectors[0]
  return first_vector[:300]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

# Thesaurus Functions

In [26]:
# Assigned to the spaCy pipeline's `max_length` in Thesaurus.set_spacy_model.
MAX_LENGTH = 50000000
# Thesaurus.lemmatize compares its `length` argument (the caller passes
# len(text)) with this value: below it the pipeline is called directly,
# otherwise the multi-process `pipe` path is used.
LEMMATIZATION_THRESHOLD = 500000


class Thesaurus:
    """Text preprocessing, embedding and SOM plotting helpers.

    A spaCy pipeline must be attached with ``set_spacy_model`` before any
    method that lemmatizes, extracts entities or creates embeddings is used.
    Several methods rely on module-level names from this notebook
    (``STOPWORDS_FILE``, ``path``, ``files``, ``bert_embedding``).
    """

    def __init__(self):
        # spaCy pipeline; populated by set_spacy_model().
        self.spacy_model = None

    @staticmethod
    def read_text(file):
        """Concatenate every line yielded by ``file`` into one string."""
        return ''.join(file)

    def set_spacy_model(self, model):
        """Load the spaCy pipeline ``model`` and raise its length limit to MAX_LENGTH."""
        self.spacy_model = spacy.load(model)
        self.spacy_model.max_length = MAX_LENGTH

    def get_nes(self, text):
        """Return the unique ORG / GPE / PERSON entity strings of ``text``.

        Order of first appearance is preserved.
        """
        doc = self.spacy_model(text)
        wanted_labels = ('ORG', 'GPE', 'PERSON')
        entities = [ent.text for ent in doc.ents if ent.label_ in wanted_labels]
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(entities))

    def lemmatize(self, text, length):
        """Return ``text`` with every token replaced by its lemma.

        Args:
            text (str): Text to lemmatize.
            length (int): Size used to choose the processing path; callers
                in this notebook pass ``len(text)``.

        Returns:
            str: Lemmas joined by single spaces.
        """
        if length < LEMMATIZATION_THRESHOLD:
            doc = self.spacy_model(text)
            return " ".join([token.lemma_ for token in doc])

        # Large input: go through `pipe` with the parser and NER disabled.
        for doc in self.spacy_model.pipe([text], batch_size=32, n_process=3, disable=["parser", "ner"]):
            # Only one text is submitted, so return on the first document.
            return " ".join([token.lemma_ for token in doc])

    @staticmethod
    def tokenize(text):
        """Lower-case and tokenize ``text`` with the imported ``tokenize`` helper."""
        # Inside a method body the bare name resolves to the module-level
        # `tokenize` import, not to this static method.
        return list(tokenize(text, to_lower=True))

    @staticmethod
    def get_stopwords(path):
        """Read a stop-word file (one word per line) into a list.

        The handle is closed on exit and only the newline is stripped, so a
        final line without a trailing newline keeps its last character.
        """
        with open(path, 'r') as stopwords_file:
            return [line.rstrip('\n') for line in stopwords_file]

    def remove_stopwords(self, tokens: list):
        """Remove stop words listed in ``STOPWORDS_FILE`` from ``tokens``.

        Returns:
            tuple: (filtered tokens with duplicates kept, the same tokens
                de-duplicated in order of first appearance).
        """
        # Set membership avoids scanning the stop-word list for every token.
        stopwords = set(self.get_stopwords(STOPWORDS_FILE))
        filtered_tokens = [token for token in tokens if token not in stopwords]
        return filtered_tokens, list(dict.fromkeys(filtered_tokens))

    def preprocess(self, tokens):
        """Keep alphabetic tokens longer than two characters found in the NLTK word list."""
        # Building the vocabulary set is expensive, so build it once per
        # instance and reuse it for later calls.
        vocabulary = getattr(self, '_vocabulary', None)
        if vocabulary is None:
            vocabulary = set(nltk.corpus.words.words())
            self._vocabulary = vocabulary

        return [token for token in tokens
                if token in vocabulary and token.isalpha() and len(token) > 2]

    def make_embeddings(self, tokens: list) -> list:
        """Return the spaCy vector of every token, cached in 'embeddings.pickle'.

        Vectors already present in the cache file (in the working directory)
        are reused; missing ones are computed with the attached spaCy
        pipeline and the cache is rewritten.

        Returns:
            list: One vector per input token, in input order. Duplicate
                tokens yield duplicate entries, so the result is always
                aligned with ``tokens``.
        """
        embeddings_filename = 'embeddings.pickle'
        cache_exists = os.path.exists(embeddings_filename)

        if cache_exists:
            # NOTE(review): pickle.load can execute arbitrary code; the cache
            # file must come from a trusted source.
            with open(embeddings_filename, 'rb') as cache_file:
                dictionary = pickle.load(cache_file)
        else:
            dictionary = dict()

        changed = False
        result = []
        for token in tokens:
            if token not in dictionary:
                dictionary[token] = self.spacy_model(token).vector
                changed = True
            result.append(dictionary[token])

        # Write when something new was computed, and also on the very first
        # run so that the cache file exists afterwards.
        if changed or not cache_exists:
            with open(embeddings_filename, 'wb') as cache_file:
                pickle.dump(dictionary, cache_file)

        return result

    @staticmethod
    def get_grid_size(n):
        """Return ceil(sqrt(5 * sqrt(n))), a side length for a square grid."""
        neurons_num = 5*np.sqrt(n)
        return int(np.ceil(np.sqrt(neurons_num)))

    @staticmethod
    def get_som(grid_size, embeddings_b, mode='load'):
        """Train a new SOM or load the pickled one from ``path + 'som.pickle'``.

        Args:
            grid_size (int): Side length of the square map (train mode only).
            embeddings_b (list): Training vectors (train mode only).
            mode (str): 'train' trains, pickles and offers the SOM for
                download; any other value loads the pickled SOM.

        Returns:
            The MiniSom instance.
        """
        sigma = 2
        lr = 5
        iters = 50000
        som_path = path+'som.pickle'

        if mode == 'train':
            som = MiniSom(grid_size, grid_size, len(embeddings_b[0]), sigma=sigma, learning_rate=lr,
                          activation_distance='euclidean', topology='hexagonal', neighborhood_function='gaussian',
                          random_seed=10)

            som.train(embeddings_b, iters, verbose=False)

            with open(som_path, 'wb') as som_file:
                pickle.dump(som, som_file)

            files.download(som_path)
        else:
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # a SOM file you created yourself.
            with open(som_path, 'rb') as som_file:
                som = pickle.load(som_file)

        return som

    @staticmethod
    def _cell_to_xy(som, cell):
        """Convert a SOM cell to plot coordinates, scaling y by sqrt(3)/2."""
        wx, wy = som.convert_map_to_euclidean(xy=cell)
        return wx, wy * np.sqrt(3) / 2

    def plot_bokeh(self, embeddings_b, filtered_btext_set, foreground_names, preprocessed_foregrounds, background_color='#d2e4f5', foreground_colors=['#f5a09a', 'green']):
        """Draw background and foreground words on the hexagonal SOM.

        Args:
            embeddings_b (list): Background embeddings.
            filtered_btext_set (list): Background words, parallel to
                ``embeddings_b``.
            foreground_names (list): ['foreground_name1', ...]
            preprocessed_foregrounds (dict):
                {'foreground_name1': {'embeds': [...], 'words': [...]}, ...}.
                Each inner dict additionally receives the keys 'label',
                'weight_x', 'weight_y', 'local_counts' and 'hex_pages'.
            background_color (str): Colour of the background hexagons.
            foreground_colors (list): Colours assigned to the foregrounds in
                order. The sequence is only read, never modified, and is
                reused cyclically when there are more foregrounds than
                colours.

        Returns:
            tuple: (bokeh figure, SOM).

        Raises:
            ValueError: If foregrounds are given but ``foreground_colors``
                is empty.
        """
        HEXAGON_SIZE = 10
        DOT_SIZE = 4

        GRID_SIZE = int(np.ceil(np.sqrt(len(embeddings_b))))
        PLOT_SIZE = HEXAGON_SIZE * (GRID_SIZE + 1)

        # Snapshot the colours. The earlier implementation popped from the
        # argument, which emptied the shared default list and made a second
        # call with default colours fail.
        palette = list(foreground_colors)
        if foreground_names and not palette:
            raise ValueError('foreground_colors must contain at least one color')

        som = self.get_som(GRID_SIZE, embeddings_b)

        # Word -> winning cell cache shared between runs.
        index_path = path+'index.pickle'
        index_cached = os.path.isfile(index_path)
        if index_cached:
            with open(index_path, 'rb') as index_file:
                index = pickle.load(index_file)
        else:
            index = dict()
        index_dirty = not index_cached

        b_label = []
        b_weight_x, b_weight_y = [], []
        for cnt, embedding in enumerate(embeddings_b):
            word = filtered_btext_set[cnt]
            if index_cached and word in index:
                w = index[word]
            else:
                # No cache yet, or the cached index does not know this word:
                # compute the winner and remember it.
                w = som.winner(embedding)
                index[word] = w
                index_dirty = True
            wx, wy = self._cell_to_xy(som, w)
            b_weight_x.append(wx)
            b_weight_y.append(wy)
            b_label.append(word)

        if index_dirty:
            with open(index_path, 'wb') as index_file:
                pickle.dump(index, index_file)

        # Offsets applied to markers of cells shared by several foregrounds.
        translations = [(-0.15, -0.15), (0.15, 0.15), (-0.15, 0.15)]

        # Map every foreground word onto the SOM.
        for foreground_unit in foreground_names:
            fu = preprocessed_foregrounds[foreground_unit]
            label = []
            weight_x, weight_y = [], []

            for cnt, embedding in enumerate(fu['embeds']):
                word = fu['words'][cnt]
                # Reuse the background position when the word is known.
                w = index[word] if word in index else som.winner(embedding)
                wx, wy = self._cell_to_xy(som, w)
                weight_x.append(wx)
                weight_y.append(wy)
                label.append(word)

            fu['label'] = label
            fu['weight_x'] = weight_x
            fu['weight_y'] = weight_y

        fig = figure(plot_height=PLOT_SIZE, plot_width=PLOT_SIZE,
                     match_aspect=True,
                     tools="pan, wheel_zoom, reset, save")

        fig.axis.visible = False
        fig.xgrid.grid_line_color = None
        fig.ygrid.grid_line_color = None

        # create data stream for plotting
        b_source_pages = ColumnDataSource(
            data=dict(
                wx=b_weight_x,
                wy=b_weight_y,
                species=b_label
            )
        )

        # Count occupied positions per foreground and over all foregrounds.
        all_weights = []
        for foreground_unit in foreground_names:
            fu = preprocessed_foregrounds[foreground_unit]
            positions = list(zip(fu['weight_x'], fu['weight_y']))
            all_weights += positions
            fu['local_counts'] = Counter(positions)

        all_weights_count = Counter(all_weights)

        for unit_number, foreground_unit in enumerate(foreground_names):
            fu = preprocessed_foregrounds[foreground_unit]

            # Cycle through the offsets so more than three foregrounds work.
            translation = translations[unit_number % len(translations)]

            overlay = {'weight_x': [], 'weight_y': [], 'label': [], 'size': []}
            for i in range(len(fu['weight_x'])):
                coords = (fu['weight_x'][i], fu['weight_y'][i])
                if all_weights_count[coords] - fu['local_counts'][coords] > 0:
                    # Another foreground occupies this cell too: draw a
                    # small, shifted marker.
                    overlay['weight_x'].append(fu['weight_x'][i]+translation[0])
                    overlay['weight_y'].append(fu['weight_y'][i]+translation[1])
                    overlay['size'].append(DOT_SIZE)
                else:
                    overlay['weight_x'].append(fu['weight_x'][i])
                    overlay['weight_y'].append(fu['weight_y'][i])
                    overlay['size'].append(HEXAGON_SIZE)
                overlay['label'].append(fu['label'][i])

            fu['hex_pages'] = ColumnDataSource(
                data=dict(
                    wx=overlay['weight_x'],
                    wy=overlay['weight_y'],
                    species=overlay['label'],
                    size=overlay['size']
                )
            )

        fig.hex(x='wy', y='wx', source=b_source_pages,
                fill_alpha=0.2, fill_color=background_color,
                line_alpha=1.0, line_color=background_color, line_width=1,
                size=HEXAGON_SIZE, name="one",
                legend_label='Background')

        for unit_number, foreground_unit in enumerate(foreground_names):
            fu = preprocessed_foregrounds[foreground_unit]
            current_color = palette[unit_number % len(palette)]
            fig.hex(x='wy', y='wx', source=fu['hex_pages'],
                    fill_color=current_color,
                    line_width=0.1,
                    size='size', name="two",
                    legend_label=foreground_unit)

        fig.legend.location = "top_left"
        fig.add_layout(fig.legend[0], 'right')
        fig.legend.click_policy="hide"
        fig.add_tools(HoverTool(
            tooltips=[
                ("label", '@species')],
            mode="mouse",
            attachment='above',
            point_policy="follow_mouse",
            names=["one", "two", "three"]
        ))

        return fig, som

    def process_texts(self, texts):
        """Turn raw texts into parallel lists of embeddings and words.

        Per text: lemmatize, tokenize, remove stop words, keep only tokens
        that occur exactly once in that text, apply ``preprocess``, and
        embed with spaCy. Words whose spaCy vector is all zeros are embedded
        with ``bert_embedding`` instead. A word is collected only the first
        time it is seen across all texts.

        Returns:
            tuple: (all_embeddings, all_words).
        """
        all_embeddings = []
        all_words = []
        # Mirror of all_words for constant-time "already collected" checks.
        seen_words = set()

        for source_text in tqdm(texts):
            lemmatized_text = self.lemmatize(source_text, len(source_text))
            tokenized_text = self.tokenize(lemmatized_text)
            filtered_tokens, filtered_tokens_set = self.remove_stopwords(tokenized_text)

            # Keep only tokens that appear exactly `occurrence` times.
            occurrence = 1
            token_counts = Counter(filtered_tokens)
            single_use_tokens = [tok for tok in filtered_tokens_set
                                 if token_counts[tok] == occurrence]

            processed_tokens_set = self.preprocess(single_use_tokens)
            embeddings = self.make_embeddings(processed_tokens_set)

            for word, embedding in zip(processed_tokens_set, embeddings):
                if word in seen_words:
                    continue
                if not np.any(embedding):
                    # All-zero spaCy vector: fall back to BERT.
                    embedding = bert_embedding(word)
                seen_words.add(word)
                all_embeddings.append(embedding)
                all_words.append(word)

        return all_embeddings, all_words

# Preprocess foreground

In [21]:
# foreground_texts maps each foreground name to the list of its raw source texts:
# foreground_texts = {'foreground name1': [source text1, source text2, ...], 'foreground name2': [source text1, source text2, ...], ...}
foreground_texts = dict()

# Foreground names, in the order they are registered:
# foreground_names = ['foreground name1', ...]
foreground_names = []

# Embeddings and tokens of each foreground, filled in after processing:
# processed_foregrounds = {'foreground_name1': {'embeds': embeddings, 'words': tokens}, ...}
processed_foregrounds = dict()

In [22]:
# Register the Physics 2017 foreground and collect its texts.
# NOTE: re-running this cell appends the name to foreground_names again.
foreground_name1 = 'Physics articles 2017'

foreground_texts[foreground_name1] = []

foreground_names.append(foreground_name1)

# Only the first `num_of_articles` entries of data_2017 are considered.
num_of_articles = 10
for physics_article in data_2017[:num_of_articles]:
  try:
    foreground_texts[foreground_name1].append(physics_article['clean'])
  except KeyError:
    # Entries without a 'clean' key are skipped silently.
    continue

In [23]:
# Register the Shakespeare foreground; it consists of a single text.
# NOTE: re-running this cell appends the name to foreground_names again.
foreground_name2 = 'Lover\'s Complaint by William Shakespeare'

foreground_texts[foreground_name2] = [shakespeare]

foreground_names.append(foreground_name2)

In [28]:
# Location of the spaCy model inside the archive extracted by download_model(),
# relative to the working directory.
MODEL = 'en_core_web_md-3.0.0/en_core_web_md/en_core_web_md-3.0.0'

# One Thesaurus instance is shared by all later cells.
obj = Thesaurus()
obj.set_spacy_model(MODEL)

In [29]:
# Embed every registered foreground and store the result under its name.
for foreground_unit in tqdm(foreground_names):

  all_embeddings_of_unit, all_words_of_unit = obj.process_texts(foreground_texts[foreground_unit])

  # 'embeds' and 'words' are the keys plot_bokeh reads.
  one_processed_foreground = {'embeds': all_embeddings_of_unit, 'words': all_words_of_unit}
  processed_foregrounds[foreground_unit] = one_processed_foreground

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:04<00:36,  4.03s/it][A
 20%|██        | 2/10 [00:09<00:40,  5.02s/it][A
 30%|███       | 3/10 [00:12<00:27,  3.90s/it][A
 40%|████      | 4/10 [00:14<00:18,  3.03s/it][A
 50%|█████     | 5/10 [00:16<00:14,  2.88s/it][A
 60%|██████    | 6/10 [00:22<00:15,  3.87s/it][A
 70%|███████   | 7/10 [00:24<00:10,  3.45s/it][A
 80%|████████  | 8/10 [00:26<00:05,  2.97s/it][A
 90%|█████████ | 9/10 [00:28<00:02,  2.66s/it][A
100%|██████████| 10/10 [00:30<00:00,  3.02s/it]
 50%|█████     | 1/2 [00:30<00:30, 30.24s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:05<00:00,  5.27s/it]
100%|██████████| 2/2 [00:35<00:00, 17.76s/it]


# Preprocess background

In [30]:
# Raw background texts; only processed by the next cell when the cached
# background pickles are missing. Left empty here.
background_texts = []

In [31]:
# Load the cached background embeddings and words when both pickles exist;
# otherwise compute them from background_texts.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
if os.path.isfile(path+'coca_embeds.pickle') and os.path.isfile(path+'coca_tokens.pickle'):
  # Context managers close the handles, which were previously left open.
  with open(path+'coca_embeds.pickle', 'rb') as embeds:
    background_embeds = pickle.load(embeds)

  with open(path+'coca_tokens.pickle', 'rb') as tokens:
    background_words = pickle.load(tokens)
else:

  background_embeds, background_words = obj.process_texts(background_texts)

# Demo

In [None]:
# Three colour-depth levels (light, medium, dark), so the colours stay roughly distinguishable in grayscale.
# Foregrounds: 10 Physics articles from 2017 vs. Shakespeare.
# Runtime: ~30 seconds

In [32]:
# Plot the foregrounds on top of the background map with the default colours.
fig, som = obj.plot_bokeh(background_embeds, background_words, foreground_names, processed_foregrounds)
show(fig)

In [None]:
# Not distinguishable in grayscale, but the colours are not gaudy.
# Physics: 2010 vs. 2020, 100 articles from each year.
# Runtime: ~3 minutes

In [None]:
# Same call as the previous demo cell; it plots whatever foreground_names and
# processed_foregrounds currently hold.
# NOTE(review): the cells that would register the 2010/2020 foregrounds described
# above are not part of this notebook.
fig, som = obj.plot_bokeh(background_embeds, background_words, foreground_names, processed_foregrounds)
show(fig)

151


# Search

In [74]:
def search(som, fig, words, embeds, search_word, search_color='blue'):
  """Highlight ``search_word`` on the map and show the figure.

  Args:
      som: SOM providing ``winner`` and ``convert_map_to_euclidean``.
      fig: Bokeh figure to draw on.
      words (list): Words on the map, parallel to ``embeds``.
      embeds (list): Embedding of each word.
      search_word (str): Word to look up in ``words``.
      search_color (str): Colour of the marker and the surrounding ring.

  Returns:
      tuple: (point, circle) renderers added to ``fig``, or (None, None)
          when ``search_word`` is not in ``words``.
  """
  # Guard only the lookup. Wrapping the whole body would report any
  # ValueError raised by the SOM or the plotting code as a missing word.
  try:
    index = words.index(search_word)
  except ValueError:
    print('No such a word in map')
    return None, None

  w = som.winner(embeds[index])
  wx, wy = som.convert_map_to_euclidean(xy=w)
  # Same vertical scaling as used when the map itself is plotted.
  wy = wy * np.sqrt(3) / 2

  source_pages = ColumnDataSource(
      data=dict(
          wx=[wx],
          wy=[wy],
          species=[search_word]
      )
  )

  # Small filled dot at the word's position ...
  point = fig.scatter(x='wy', y='wx', source=source_pages,
                line_width=0.1, fill_color=search_color, size=4)
  # ... plus a large hollow ring around it.
  circle = fig.scatter(x='wy', y='wx', source=source_pages,
                line_color=search_color, line_width=1, line_alpha=1,
                fill_alpha=0,
                size=160)

  show(fig)

  return point, circle

# Renderers of the most recent search (none yet).
point, circle = None, None

In [76]:
# Hide the markers of the previous search (if any) before drawing new ones.
# Identity comparison is the correct test for None and does not depend on
# how the renderer objects implement equality.
if point is not None and circle is not None:
  point.visible, circle.visible = False, False
point, circle = search(som, fig, background_words, background_embeds, 'overload')