In [1]:
!pip install transformers
!pip install tensorflow



In [2]:
import pickle
import pandas as pd
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('preprocessed_data.csv')

In [4]:
df['entity'][0]

'interviews'

In [5]:
# for entity
# determine max length
# Assuming data_clean is a list of texts
data_clean_special = ["[CLS] " + text + " [SEP]" for text in df['entity']]
print("First list entry after adding special tokens:", df['entity'][0])

First list entry after adding special tokens: interviews


In [6]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize each entity in data_clean_special
tokenized_texts = [tokenizer.tokenize(text) for text in data_clean_special]
print("First list entry after tokenization:", tokenized_texts[0])

First list entry after tokenization: ['[CLS]', 'interviews', '[SEP]']


In [7]:
token_lengths = [len(tokens) for tokens in tokenized_texts]

# Determine the 85%-quantile for the maximum sequence length
max_seq_length = int(np.quantile(token_lengths, 0.85))

print("Maximum sequence length (85%-quantile):", max_seq_length)

Maximum sequence length (85%-quantile): 5


In [8]:
# sentences_padded = pad_sequences(tokenized_texts, dtype=object,
# maxlen=27, value='[PAD]', truncating="post",
# padding="post")

In [9]:
# # Examine the first list entry
# print("First list entry after padding:", sentences_padded[0])

In [10]:
# # Convert tokens to their ids using the BERT tokenizer
# sentences_converted = [bert_uncased.convert_tokens_to_ids(s) for s in sentences_padded]

# # Examine the first list entry
# print("First list entry after token conversion:", sentences_converted[0])

In [11]:
# Assuming data_clean is a list of texts
sentences_converted_quick = [[tokenizer.encode(s, add_special_tokens=True, padding='max_length', truncation='longest_first', max_length=27)] for s in data_clean_special]

# Examine the first list entry
print("First list entry after quick conversion:", sentences_converted_quick[0])

First list entry after quick conversion: [[101, 101, 7636, 102, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [12]:
# Convert the token IDs to torch tensors
inputs = torch.tensor(sentences_converted_quick)

# Check the shape of the tensor
print("Shape of inputs tensor:", inputs.size())

Shape of inputs tensor: torch.Size([1000, 1, 27])


In [13]:
model = BertModel.from_pretrained('bert-base-uncased')

In [14]:
with torch.no_grad():
    outputs = model(inputs[0])

# Extract the embeddings for the first token of the first sequence
embeddings = outputs.last_hidden_state[0][0].numpy()

# Print the shape of the embeddings
print("Shape of embeddings:", embeddings.shape)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Shape of embeddings: (768,)


In [15]:
embeddings[:10]

array([-0.49304637, -0.12750393,  0.35690206,  0.21373959, -0.28235322,
        0.04453138,  0.3793398 , -0.1241277 , -0.18539128, -0.05853705],
      dtype=float32)

In [16]:
def get_embeddings_for_document(document, model, tokenizer, max_length=27):
    # Convert entity to token IDs and pad/truncate
    encoded_input = tokenizer.encode_plus(document,
                                          add_special_tokens=True,
                                          padding='max_length',
                                          truncation='longest_first',
                                          max_length=max_length,
                                          return_tensors='pt')
    input_ids = encoded_input['input_ids']

    # Get embeddings for the current document
    with torch.no_grad():
        outputs = model(input_ids)

    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[0][0].numpy()

    return embeddings


# Initialize BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Get embeddings for all documents
all_embeddings = []
for document in data_clean_special:
    embeddings = get_embeddings_for_document(document, model, tokenizer)
    all_embeddings.append(embeddings)

# Convert the list of embeddings to a NumPy array for feeding into a machine learning model
all_embeddings_array = np.array(all_embeddings)

In [17]:
all_embeddings_array

array([[-0.49304637, -0.12750393,  0.35690206, ..., -0.2049888 ,
         0.59780824, -0.3042892 ],
       [-0.6185234 , -0.09725523,  0.311899  , ..., -0.10109094,
         0.40868673, -0.0291547 ],
       [-0.6588329 , -0.18423378,  0.14582519, ..., -0.09490737,
         0.4908951 ,  0.0941496 ],
       ...,
       [-0.37301475, -0.0550429 ,  0.3396399 , ..., -0.10037331,
         0.4363306 , -0.09867598],
       [-0.66338724, -0.17135727,  0.44374946, ..., -0.08367468,
         0.74569345, -0.3333531 ],
       [-0.4996609 , -0.10103997,  0.395641  , ..., -0.00661091,
         0.5597762 , -0.08405627]], dtype=float32)

In [18]:
df_embeddings = pd.DataFrame(all_embeddings_array)

In [19]:
df_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.493046,-0.127504,0.356902,0.213740,-0.282353,0.044531,0.379340,-0.124128,-0.185391,-0.058537,...,-0.154193,-0.001004,0.246955,0.011575,0.192595,-0.444240,0.069651,-0.204989,0.597808,-0.304289
1,-0.618523,-0.097255,0.311899,0.086365,-0.167940,0.105449,0.252365,-0.019911,-0.239084,-0.103387,...,0.014048,0.100635,0.286525,0.060596,0.377712,-0.413099,0.120353,-0.101091,0.408687,-0.029155
2,-0.658833,-0.184234,0.145825,0.208594,-0.222928,0.187397,0.152150,-0.052557,-0.162725,-0.089034,...,0.041876,0.201030,0.153229,-0.000881,0.230243,-0.232192,0.060281,-0.094907,0.490895,0.094150
3,-0.640721,-0.255062,0.486257,0.048884,-0.352733,0.011589,0.224444,-0.191657,-0.182377,0.051113,...,-0.156720,0.290073,0.214583,-0.193672,0.426431,-0.621424,0.149215,-0.127817,0.514537,-0.273813
4,-0.625892,-0.109920,0.290425,0.179737,-0.143044,0.105934,0.187089,-0.116364,-0.225682,0.008960,...,-0.002138,0.202907,0.136389,0.057703,0.229738,-0.305836,0.028529,-0.203359,0.449917,-0.028422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.645166,-0.155494,0.348832,0.127759,-0.200844,-0.147906,0.446472,-0.003472,-0.139285,-0.020009,...,-0.067549,0.174661,0.244805,-0.157010,0.277910,-0.622183,0.050523,-0.080757,0.651381,-0.170470
996,-0.645166,-0.155494,0.348832,0.127759,-0.200844,-0.147906,0.446472,-0.003472,-0.139285,-0.020009,...,-0.067549,0.174661,0.244805,-0.157010,0.277910,-0.622183,0.050523,-0.080757,0.651381,-0.170470
997,-0.373015,-0.055043,0.339640,0.184498,-0.205556,0.056685,0.196215,0.111421,-0.370837,-0.142796,...,-0.121257,0.100536,0.228765,0.053457,0.394366,-0.529538,0.120893,-0.100373,0.436331,-0.098676
998,-0.663387,-0.171357,0.443749,0.074684,-0.298812,-0.271282,0.444449,-0.104071,-0.157868,0.009690,...,-0.050234,0.148283,0.247628,-0.220014,0.352250,-0.642989,0.074952,-0.083675,0.745693,-0.333353


In [24]:
# Assign column names in the format 'entity_embeddings + number'
num_dimensions = all_embeddings_array.shape[1]
column_names = [f'entity_embeddings_{i+1}' for i in range(num_dimensions)]
df_embeddings.columns = column_names

In [25]:
df_embeddings

Unnamed: 0,entity_embeddings_1,entity_embeddings_2,entity_embeddings_3,entity_embeddings_4,entity_embeddings_5,entity_embeddings_6,entity_embeddings_7,entity_embeddings_8,entity_embeddings_9,entity_embeddings_10,...,entity_embeddings_759,entity_embeddings_760,entity_embeddings_761,entity_embeddings_762,entity_embeddings_763,entity_embeddings_764,entity_embeddings_765,entity_embeddings_766,entity_embeddings_767,entity_embeddings_768
0,-0.493046,-0.127504,0.356902,0.213740,-0.282353,0.044531,0.379340,-0.124128,-0.185391,-0.058537,...,-0.154193,-0.001004,0.246955,0.011575,0.192595,-0.444240,0.069651,-0.204989,0.597808,-0.304289
1,-0.618523,-0.097255,0.311899,0.086365,-0.167940,0.105449,0.252365,-0.019911,-0.239084,-0.103387,...,0.014048,0.100635,0.286525,0.060596,0.377712,-0.413099,0.120353,-0.101091,0.408687,-0.029155
2,-0.658833,-0.184234,0.145825,0.208594,-0.222928,0.187397,0.152150,-0.052557,-0.162725,-0.089034,...,0.041876,0.201030,0.153229,-0.000881,0.230243,-0.232192,0.060281,-0.094907,0.490895,0.094150
3,-0.640721,-0.255062,0.486257,0.048884,-0.352733,0.011589,0.224444,-0.191657,-0.182377,0.051113,...,-0.156720,0.290073,0.214583,-0.193672,0.426431,-0.621424,0.149215,-0.127817,0.514537,-0.273813
4,-0.625892,-0.109920,0.290425,0.179737,-0.143044,0.105934,0.187089,-0.116364,-0.225682,0.008960,...,-0.002138,0.202907,0.136389,0.057703,0.229738,-0.305836,0.028529,-0.203359,0.449917,-0.028422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.645166,-0.155494,0.348832,0.127759,-0.200844,-0.147906,0.446472,-0.003472,-0.139285,-0.020009,...,-0.067549,0.174661,0.244805,-0.157010,0.277910,-0.622183,0.050523,-0.080757,0.651381,-0.170470
996,-0.645166,-0.155494,0.348832,0.127759,-0.200844,-0.147906,0.446472,-0.003472,-0.139285,-0.020009,...,-0.067549,0.174661,0.244805,-0.157010,0.277910,-0.622183,0.050523,-0.080757,0.651381,-0.170470
997,-0.373015,-0.055043,0.339640,0.184498,-0.205556,0.056685,0.196215,0.111421,-0.370837,-0.142796,...,-0.121257,0.100536,0.228765,0.053457,0.394366,-0.529538,0.120893,-0.100373,0.436331,-0.098676
998,-0.663387,-0.171357,0.443749,0.074684,-0.298812,-0.271282,0.444449,-0.104071,-0.157868,0.009690,...,-0.050234,0.148283,0.247628,-0.220014,0.352250,-0.642989,0.074952,-0.083675,0.745693,-0.333353


In [26]:
df_embeddings.to_csv('entity_embeddings.csv', index=False)