In [1]:
import torch
from transformers import BertTokenizer, BertModel

# Location handed to `from_pretrained` for both the model and the tokenizer
# (going by its name, a local copy of the uncased SciBERT checkpoint).
model_version = 'models/scibert_scivocab_uncased'
# Forwarded to the tokenizer; presumably chosen to match the "uncased" checkpoint.
do_lower_case = True
# NOTE(review): the recorded output of this cell reports unused `cls.*` weights and a
# tokenizer-class mismatch warning. The unused heads are expected for a bare BertModel
# (the message itself says so); the tokenizer class is worth confirming.
model = BertModel.from_pretrained(model_version)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

Some weights of the model checkpoint at models/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

def embed_text(text, model):
    """Return the last hidden states that ``model`` produces for ``text``.

    The text is encoded with the notebook-level ``tokenizer`` (truncated to at
    most 512 token ids) and passed through ``model`` as a batch of one.

    The forward pass runs under ``torch.no_grad()``. In this notebook the
    result is only used as a fixed feature vector; with gradient tracking
    enabled every returned tensor would keep its whole autograd graph alive,
    which is a plausible contributor to the out-of-memory failure recorded
    further down in the notebook.

    Args:
        text: Input handed to ``tokenizer.encode``.
        model: Callable whose output's first element is the last hidden state
            (the ``BertModel`` instance in this notebook).

    Returns:
        The first element of the model output, detached from autograd. For a
        BERT-style model this is presumably shaped
        (1, sequence_length, hidden_size) -- TODO confirm.
    """
    token_ids = tokenizer.encode(text, truncation=True, max_length=512)
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # Batch size 1
    with torch.no_grad():
        outputs = model(input_ids)
    # The last hidden-state is the first element of the output tuple.
    return outputs[0]

def get_similarity(em, em2):
    """Cosine similarity between two embedding tensors.

    Both tensors are detached from autograd and converted to NumPy arrays
    before being handed to scikit-learn's ``cosine_similarity``, whose result
    is returned unchanged.
    """
    first = em.detach().numpy()
    second = em2.detach().numpy()
    return cosine_similarity(first, second)

In [4]:
# Every phrase is represented by the average of its token embeddings: the mean
# is taken over dimension 1, which is the sequence length.
coronavirus_em = embed_text("Coronavirus", model).mean(dim=1)
print(coronavirus_em.dtype)
mers_em = embed_text("Middle East Respiratory Virus", model).mean(dim=1)
flu_em = embed_text("Flu", model).mean(dim=1)
bog_em = embed_text("Bog", model).mean(dim=1)
covid_2019 = embed_text("COVID-2019", model).mean(dim=1)

# Compare each phrase against "Coronavirus"; one printed line per pair.
for _prefix, _other_em in (
    ("Similarity for Coronavirus and Flu:", flu_em),
    ("Similarity for Coronavirus and MERs:", mers_em),
    ("Similarity for Coronavirus and COVID-2019:", covid_2019),
    ("Similarity for Coronavirus and Bog:", bog_em),
):
    print(_prefix + str(get_similarity(coronavirus_em, _other_em)))

torch.float32
Similarity for Coronavirus and Flu:[[0.685802]]
Similarity for Coronavirus and MERs:[[0.77256453]]
Similarity for Coronavirus and COVID-2019:[[0.7233723]]
Similarity for Coronavirus and Bog:[[0.6451756]]


In [5]:
import umap.umap_ as umap
# UMAP's layout is stochastic; pin the seed so that a fresh "Restart & Run All"
# reproduces the same projection. (Per the umap-learn docs, a fixed
# random_state trades some parallelism for reproducibility.)
reducer = umap.UMAP(random_state=42)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [5]:
import pandas as pd
# Provenance (kept for reference, not executed): the CSV read below appears to
# have been exported once from the Zenodo pickle with the following lines.
# from DataCollectorClient.src.services.zenodo_parser import ZenodoParser
# parser = ZenodoParser()
# dat = parser.load("DataCollectorClient/src/services/pickle_test_all_with_files_sparse.pickle").data
# dat[["title", "description"]].to_csv("title_description_zenodo.csv", index=False)

# Number of leading records to load and embed; raise it for a fuller run.
MAX_RECORDS = 20

# `to_numpy()` is the accessor pandas recommends over `.values`; the resulting
# array is the same. Going by the export snippet above, column 0 holds the
# title and column 1 the description.
title_and_description_metadata = (
    pd.read_csv("title_description_zenodo.csv").head(MAX_RECORDS).to_numpy()
)

In [7]:


def make_data_embedding(title_description_metadata, method="mean", dim=1):
    """Embed the description column of a (title, description) table.

    The input is left untouched: the work happens on a copy, so the cell can
    be re-run without feeding already-embedded tensors back in as text.
    Embedding runs under ``torch.no_grad()`` so that the stored tensors do not
    keep an autograd graph alive for every record.

    Args:
        title_description_metadata: 2-D array indexed as ``[row, column]``
            whose column 1 holds the description text. Presumably an
            object-dtype NumPy array such as the one produced from the CSV
            -- TODO confirm.
        method: Pooling strategy. Only ``"mean"`` is implemented; for any
            other value the descriptions are returned as they came in.
        dim: Dimension that ``"mean"`` averages over (1 is the sequence
            length according to the notebook's own comment).

    Returns:
        A copy of the input in which, for ``method="mean"``, every entry of
        column 1 has been replaced by the pooled embedding tensor.
    """
    embedded = title_description_metadata.copy()
    if method != "mean":
        # Nothing would be stored for other methods, so skip the model calls.
        return embedded

    with torch.no_grad():
        for row in range(len(embedded)):
            hidden_states = embed_text(embedded[row, 1], model)
            embedded[row, 1] = hidden_states.mean(dim)
    return embedded


# Embed every loaded description with the defaults (method="mean", dim=1).
# NOTE(review): the recorded run of this cell stopped with
# "DefaultCPUAllocator: not enough memory" (see the output below), so the
# later cells were never executed in the saved notebook.
title_and_description_embedding = make_data_embedding(title_and_description_metadata)

RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3588096 bytes.

In [None]:
# Column 0 carries the titles, column 1 the pooled description embeddings.
title_list = title_and_description_embedding[:, 0]
description_embedding_list = [
    embedding for embedding in title_and_description_embedding[:, 1]
]

# Concatenate the per-record embeddings along dim 0 into one matrix and let
# the UMAP reducer project it.
embed_list = torch.cat(description_embedding_list, dim=0)
red = reducer.fit_transform(embed_list.detach().numpy())


In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10, Category20c
from bokeh.palettes import magma
import pandas as pd
# Configure Bokeh so that `show(...)` renders inline in the notebook.
output_notebook()


In [None]:
def make_plot(red, title_list, number=200, color = True, color_mapping_cat=None, color_cats = None, bg_color="white"):
    """Show an interactive Bokeh scatter plot of a 2-D projection.

    Args:
        red: Two-column array-like of projected coordinates; the columns
            become ``x`` and ``y``.
        title_list: One label per row. Shown in the hover tooltip and, when
            ``color`` is truthy, used as the categorical colour factors.
        number: Size of the magma palette used for title-based colouring.
        color: If truthy, every point is coloured by its title. This takes
            precedence over ``color_mapping_cat``.
        color_mapping_cat: Optional per-row category values. Used for
            colouring and the legend only when ``color`` is falsy. Lists,
            NumPy arrays and pandas Series are accepted.
        color_cats: Category factors for the colour mapper. When omitted,
            the distinct values of ``color_mapping_cat`` are used in
            first-seen order.
        bg_color: Background fill colour of the plot area.

    Returns:
        None. The figure is displayed through ``bokeh.plotting.show``.
    """
    # Explicit None/length test: plain truthiness raises for NumPy arrays and
    # pandas Series with more than one element.
    has_categories = color_mapping_cat is not None and len(color_mapping_cat) > 0

    digits_df = pd.DataFrame(red, columns=('x', 'y'))
    if has_categories:
        digits_df['colors'] = color_mapping_cat
    # 'digit' is the column name the hover template and colour specs refer to.
    digits_df['digit'] = title_list
    datasource = ColumnDataSource(digits_df)

    plot_figure = figure(
        title='UMAP projection of the article title embeddings',
        width=890,
        height=600,
        tools='pan, wheel_zoom, reset',
        background_fill_color=bg_color,
    )
    # The template used to contain an <img src='@image'> element, but the data
    # source has no 'image' column, so that element was dropped.
    plot_figure.add_tools(HoverTool(tooltips="""
    <div>
    <div>
        <span style='font-size: 10px; color: #224499'></span>
        <span style='font-size: 10px'>@digit</span>
    </div>
    </div>
    """))

    # Options shared by all three colouring modes.
    shared_glyph_options = dict(source=datasource, line_alpha=0.6, fill_alpha=0.6)

    if color:
        color_mapping = CategoricalColorMapper(factors=title_list, palette=magma(number))
        plot_figure.circle(
            'x',
            'y',
            color=dict(field='digit', transform=color_mapping),
            size=7,
            **shared_glyph_options
        )
    elif has_categories:
        if color_cats is None:
            color_cats = list(dict.fromkeys(color_mapping_cat))
        # Two extra palette entries are generated and the first two discarded
        # (presumably to avoid the darkest magma shades).
        color_mapping = CategoricalColorMapper(factors=color_cats, palette=magma(len(color_cats)+2)[2:])
        plot_figure.circle(
            'x',
            'y',
            color=dict(field='colors', transform=color_mapping),
            size=8,
            legend_field='colors',
            **shared_glyph_options
        )
        # Must come after the glyph: the legend only exists once a renderer
        # with a legend entry has been added. The value is a plain string.
        plot_figure.legend.location = "top_left"
    else:
        # NOTE(review): this reads the colour straight from the 'digit'
        # column, which only works if title_list holds colour values --
        # confirm the intent.
        plot_figure.circle(
            'x',
            'y',
            color=dict(field='digit'),
            size=7,
            **shared_glyph_options
        )

    show(plot_figure)

# Plot the projection with the default title-based colouring (`color` defaults
# to True); `number` is the size of the magma palette the titles are mapped to.
make_plot(red, title_list, number=200)