In [1]:
import pandas as pd
import numpy as np
import torch

import sys
import time

import re

from transformers import BertTokenizerFast, BertModel
from datasets import load_dataset

from pprint import pprint
import io

import logging
logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt

from helper import stream

import psycopg2

# Length of dataset: 2,326,839

# Run on the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda', index=0)

In [2]:
# Load the pretrained uncased BERT tokenizer and encoder.
# output_hidden_states=True makes the model output expose `hidden_states`,
# which the embedding helpers further down index into.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).to(device)
# Display the module tree as the cell output.
model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [3]:
# Open the article stream provided by the local `helper` module.
data_stream = stream()
# Peek at the fields of one record. NOTE: next() consumes that record, so later
# cells that read from data_stream start from the following one.
next(data_stream).keys()

dict_keys(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'])

In [4]:
def tokenize_data(data):
    """Tokenize text with the module-level BERT tokenizer and move it to ``device``.

    Args:
        data: Text handed straight to ``tokenizer``. The cells in this notebook
            pass either one abstract string or a list of abstract strings.

    Returns:
        The tokenizer's output (requested as PyTorch tensors, with padding and
        truncation enabled) after ``.to(device)`` has been called on it.
    """
    encoding = tokenizer(data, return_tensors='pt', truncation=True, padding=True)
    # The return value of .to() is deliberately ignored; the same object is returned.
    encoding.to(device)
    return encoding

# Smoke test: tokenize the abstract of the next record pulled from the stream.
tokenize_data(next(data_stream)['abstract'])

{'input_ids': tensor([[  101,  2057,  6235,  1037,  2047,  9896,  1010,  1996,  1002,  1006,
          1047,  1010,  1032,  3449,  2140,  1007,  1002,  1011, 21877, 11362,
          2208,  2007,  6087,  1010,  1998,  2224,  2009,  6855,  1037, 23191,
          1997,  1996,  2155,  1997,  1002,  1006,  1047,  1010,  1032,  3449,
          2140,  1007,  1002,  1011, 20288, 19287,  1998,  9896,  2594,  7300,
          2000,  1037,  2155,  1997,  3471,  7175,  3392, 22511,  2015,  1997,
         19287,  1012,  2569, 12107,  1997, 20288, 19287,  3711,  1999, 11841,
          3012,  3399,  1998,  2031,  2363,  3445,  3086,  1999,  3522,  2086,
          1012,  1999,  3327,  1010,  2256,  6910, 28962,  2236,  4697,  1998,
         12919,  1996,  3025,  3463,  1997,  3389,  1998,  2358,  2890,  2378,
          2226,  1998,  2507,  1037,  2047,  6947,  1997,  1996, 10722,  4674,
          1011, 10594,  1011,  3766, 23191,  1997, 19679, 28775,  3723,  1012,
          2057,  2036,  2556,  1037,  

In [5]:
def vec_to_sql_string(vector):
    """Render an embedding as a PostgreSQL array literal of quoted elements.

    Each item along the first axis becomes one double-quoted element of a
    ``{...}`` array literal. For a ``(tokens, dim)`` input every element is the
    bracketed list text ``[x, y, ...]``, which is the shape needed by the
    ``vector(768)[]`` column this notebook creates.

    Args:
        vector: A torch tensor (any device) or numpy array. A leading singleton
            batch axis, as in the ``(1, tokens, dim)`` output of ``embed`` for
            one abstract, is dropped so the elements are per-token vectors
            rather than one nested ``[[...]]`` blob.

    Returns:
        str: The array literal, e.g. ``{"[1.0, 2.0]", "[3.0, 4.0]"}``.
    """
    if isinstance(vector, torch.Tensor):
        vector = vector.detach().cpu().numpy()
    rows = np.asarray(vector)

    # (1, tokens, dim) -> (tokens, dim); other shapes are left untouched.
    if rows.ndim == 3 and rows.shape[0] == 1:
        rows = rows[0]

    quoted = ['"' + str(item) + '"' for item in rows.tolist()]
    return '{' + ', '.join(quoted) + '}'

def from_sql_to_list(string):
    """Parse the text form of a stored embedding back into a tensor.

    Every ``[...]`` group found in ``string`` is read as one list of numbers
    and the groups are stacked in the order they appear.

    Args:
        string: Text containing one or more bracketed number lists.

    Returns:
        torch.Tensor: A float64 tensor on the module-level ``device`` with one
        row per bracketed group. Despite the function's name, the result is a
        tensor, not a Python list.

    Raises:
        ValueError, SyntaxError: If a bracketed group is not a plain literal.
    """
    import ast  # stdlib; imported locally so this cell does not rely on another edit

    groups = re.findall(r'\[.*?\]', string)
    # SECURITY: this used to be eval(), which would execute any expression found
    # in text read back from the database. literal_eval accepts literals only.
    rows = [ast.literal_eval(group) for group in groups]
    return torch.tensor(np.array(rows), device=device, dtype=torch.float64)

def connect_to_db():
    """Open a PostgreSQL connection and make sure the pgvector extension exists.

    Connection settings are read from the standard libpq environment variables
    (``PGHOST``, ``PGPORT``, ``PGDATABASE``, ``PGUSER``, ``PGPASSWORD``). When a
    variable is unset, the notebook's original local-development value is used,
    so existing setups keep working unchanged.

    Returns:
        tuple: ``(cur, conn)``, an open cursor and its connection. The
        connection is in autocommit mode.

    Raises:
        psycopg2.Error: If connecting or creating the extension fails. The
        connection is closed before the error propagates.
    """
    import os  # stdlib; imported locally so this cell does not rely on another edit

    conn = psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        port=int(os.environ.get("PGPORT", "5432")),
        database=os.environ.get("PGDATABASE", "vector_database"),
        user=os.environ.get("PGUSER", "postgres"),
        # NOTE(review): the literal fallback is kept only for backward
        # compatibility; set PGPASSWORD rather than relying on it.
        password=os.environ.get("PGPASSWORD", "admin"),
    )
    conn.autocommit = True

    cur = conn.cursor()
    try:
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
    except Exception:
        # Do not leak the connection when the extension cannot be created.
        conn.close()
        raise
    return cur, conn

# Module-level cursor and connection used by the database cells in this notebook.
cur, conn = connect_to_db()
def save_batch(batch, start_id=None):
    """Insert a batch of embedded articles into ``article_embeddings``.

    Rows were previously keyed by their position inside the batch, so every
    batch reused IDs 0..n-1 and collided with the primary key on the second
    call. IDs now continue from ``start_id``.

    Args:
        batch: Sequence of dicts with ``'Title'``, ``'Abstract'`` and
            ``'Embedding'`` keys. An empty batch is a no-op.
        start_id: ``article_ID`` for the first row; later rows count up from
            it. When ``None`` (the default), numbering continues after the
            largest ``article_ID`` already in the table, or starts at 0 for an
            empty table. This lookup assumes a single writer.
    """
    if not batch:
        return

    if start_id is None:
        cur.execute("SELECT COALESCE(MAX(article_ID), -1) + 1 FROM article_embeddings")
        start_id = cur.fetchone()[0]

    rows = [
        (start_id + offset, item['Title'], item['Abstract'], vec_to_sql_string(item['Embedding']))
        for offset, item in enumerate(batch)
    ]
    cur.executemany("INSERT INTO article_embeddings VALUES (%s, %s, %s, %s)", rows)



In [8]:
def embed(article):
    """Compute BERT token embeddings for a piece of text.

    Args:
        article: Text forwarded to ``tokenize_data``; the demo call below passes
            a single abstract string.

    Returns:
        The second-to-last entry of the model's ``hidden_states``.
    """
    model_inputs = tokenize_data(article)
    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        second_to_last = model(**model_inputs).hidden_states[-2]
    return second_to_last

# Sanity check on one abstract: show the shape of the returned embedding.
embed(next(data_stream)['abstract']).shape

torch.Size([1, 69, 768])

In [7]:
def embed_and_append_to_csv(output_csv='vector_db.csv', batch_size=32, max_batches=1):
    """Embed abstracts from ``data_stream`` and append them to a CSV file.

    Each written row holds the article title, its abstract and the embedding
    array. Rows are appended without a header line.

    Args:
        output_csv: Path of the CSV file to append to.
        batch_size: Number of articles collected before each write.
        max_batches: Stop after this many full batches have been written. The
            default of 1 keeps the original one-batch trial run; pass ``None``
            to consume the whole stream.
    """
    # Lift numpy's summarisation threshold; presumably so the embedding arrays
    # are written out in full rather than abbreviated with "...".
    np.set_printoptions(threshold=sys.maxsize)

    def _append_rows(rows):
        # Append one chunk of rows to the CSV and report how many were written.
        embeddings_df = pd.DataFrame(rows)
        print(len(embeddings_df))
        embeddings_df.to_csv(output_csv, mode='a', header=False, index=False)

    batch = []
    batches_written = 0
    for article in data_stream:
        # embed() tokenizes text, so pass the abstract rather than the whole
        # record dict (which the tokenizer cannot accept).
        embedding = embed(article['abstract'])
        batch.append({
            'Title': article['title'],
            'Abstract': article['abstract'],
            'Embedding': embedding.detach().cpu().numpy(),
        })
        if len(batch) >= batch_size:
            _append_rows(batch)
            batch = []
            batches_written += 1
            if max_batches is not None and batches_written >= max_batches:
                return

    # The stream ended mid-batch: write the remainder instead of dropping it.
    if batch:
        _append_rows(batch)

# Run the CSV export with its default arguments (appends to 'vector_db.csv').
embed_and_append_to_csv()

32


KeyboardInterrupt: 

In [8]:
# Recreate the embeddings table from scratch. WARNING: this discards any rows
# already stored. IF EXISTS lets the cell run on a fresh database where the
# table has not been created yet (a plain DROP TABLE raises in that case).
cur.execute("DROP TABLE IF EXISTS article_embeddings;")
cur.execute("create table article_embeddings(article_ID int primary key, article_title varchar, abstract varchar, embedding vector(768)[]);")
# connect_to_db() puts the connection in autocommit mode, so this commit is a
# harmless no-op kept for explicitness.
conn.commit()

In [22]:
%%time
def embed_data(data_stream=data_stream):
    """Embed every article in ``data_stream`` and save the trailing batch.

    Articles are embedded one at a time and grouped into batches of 32 records
    (``Title``, ``Abstract``, ``Embedding``). Only the final partial batch is
    passed to ``save_batch``; see the review note in the loop.

    Args:
        data_stream: Iterable of article records with ``'title'`` and
            ``'abstract'`` keys. The default is bound to the module-level
            stream when this cell is executed.
    """
    batch = []
    embeddings = []
    batch_lim = 32
    for article in data_stream:
        # embed() tokenizes text, so pass the abstract rather than the whole
        # record dict (which the tokenizer cannot accept).
        emb = embed(article['abstract'])
        # .cpu() already yields a CPU tensor; no numpy round trip is needed.
        batch.append({'Title': article['title'], 'Abstract': article['abstract'], 'Embedding': emb.cpu()})
        if len(batch) >= batch_lim:
            # NOTE(review): full batches are only collected in memory and are
            # never written. Before saving them here, confirm that save_batch
            # assigns article IDs that do not repeat from one batch to the next.
            embeddings.append(batch)
            batch = []
    # Only the final, partial batch reaches the database.
    save_batch(batch)
embed_data()

# Estimated time(in days) = (((WallTime/batch_lim) * 2'326'839)/3600)/24

# TODO: Add a progress bar
# TODO: Add a way to resume from a checkpoint
# TODO: Save the batch to a CSV instead, then use bulk copy to insert into the database
# TODO: Embed the articles in batches; BERT can take batches of texts
# TODO: start with ... (note left unfinished)

CPU times: total: 109 ms
Wall time: 788 ms


In [None]:
def embed(batch):
    """Compute BERT token embeddings for one text or a list of texts.

    NOTE(review): this redefines ``embed`` from an earlier cell with the same
    logic; only the parameter name differs.

    Args:
        batch: Text forwarded to ``tokenize_data``; the cell below passes a list
            of abstracts. The batch size is decided by the caller, not here.

    Returns:
        The second-to-last entry of the model's ``hidden_states``.
    """
    encoded_inputs = tokenize_data(batch)

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded_inputs)
    return model_output.hidden_states[-2]

# Pull the next `batch_size` abstracts off the stream and embed them in one call.
batch_size = 10
batch = [next(data_stream)['abstract'] for _ in range(batch_size)]

embed(batch).shape