In [1]:
%pip install transformers
%pip install tensorflow



In [2]:
import pickle
import pandas as pd
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('preprocessed_data.csv')

In [4]:
df['sentence_stemmed'][0]

'especi problem reduc ident effort vt identifi solut stem interview reveal lack knowledg exist bodi literatur'

In [5]:
# for original sentences
# determine max length
# Assuming data_clean is a list of texts
data_clean_special = ["[CLS] " + text + " [SEP]" for text in df['sentence_stemmed']]
print("First list entry after adding special tokens:", df['sentence_stemmed'][0])

First list entry after adding special tokens: especi problem reduc ident effort vt identifi solut stem interview reveal lack knowledg exist bodi literatur


In [6]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize each text in data_clean_special
tokenized_texts = [tokenizer.tokenize(text) for text in data_clean_special]
print("First list entry after tokenization:", tokenized_texts[0])

First list entry after tokenization: ['[CLS]', 'es', '##pe', '##ci', 'problem', 'red', '##uc', 'id', '##ent', 'effort', 'vt', 'id', '##ent', '##if', '##i', 'sol', '##ut', 'stem', 'interview', 'reveal', 'lack', 'know', '##led', '##g', 'exist', 'bo', '##di', 'liter', '##at', '##ur', '[SEP]']


In [7]:
token_lengths = [len(tokens) for tokens in tokenized_texts]

# Determine the 85%-quantile for the maximum sequence length
max_seq_length = int(np.quantile(token_lengths, 0.85))

print("Maximum sequence length (85%-quantile):", max_seq_length)

Maximum sequence length (85%-quantile): 38


In [8]:
# sentences_padded = pad_sequences(tokenized_texts, dtype=object,
# maxlen=27, value='[PAD]', truncating="post",
# padding="post")

In [9]:
# # Examine the first list entry
# print("First list entry after padding:", sentences_padded[0])

In [10]:
# # Convert tokens to their ids using the BERT tokenizer
# sentences_converted = [bert_uncased.convert_tokens_to_ids(s) for s in sentences_padded]

# # Examine the first list entry
# print("First list entry after token conversion:", sentences_converted[0])

In [11]:
# Assuming data_clean is a list of texts
sentences_converted_quick = [[tokenizer.encode(s, add_special_tokens=True, padding='max_length', truncation='longest_first', max_length=27)] for s in data_clean_special]

# Examine the first list entry
print("First list entry after quick conversion:", sentences_converted_quick[0])

First list entry after quick conversion: [[101, 101, 9686, 5051, 6895, 3291, 2417, 14194, 8909, 4765, 3947, 28879, 8909, 4765, 10128, 2072, 14017, 4904, 7872, 4357, 7487, 3768, 2113, 3709, 2290, 4839, 102]]


In [12]:
# Convert the token IDs to torch tensors
inputs = torch.tensor(sentences_converted_quick)

# Check the shape of the tensor
print("Shape of inputs tensor:", inputs.size())

Shape of inputs tensor: torch.Size([1000, 1, 27])


In [13]:
model = BertModel.from_pretrained('bert-base-uncased')

In [14]:
with torch.no_grad():
    outputs = model(inputs[0])

# Extract the embeddings for the first token of the first sequence
embeddings = outputs.last_hidden_state[0][0].numpy()

# Print the shape of the embeddings
print("Shape of embeddings:", embeddings.shape)

Shape of embeddings: (768,)


In [15]:
embeddings[:10]

array([-0.39275855, -0.16091648, -0.23698685,  0.01786455, -0.18520185,
       -0.0474395 ,  0.30472496,  0.29422417, -0.15951036, -0.20334853],
      dtype=float32)

In [16]:
def get_embeddings_for_document(document, model, tokenizer, max_length=27):
    # Convert sentence to token IDs and pad/truncate
    encoded_input = tokenizer.encode_plus(document,
                                          add_special_tokens=True,
                                          padding='max_length',
                                          truncation='longest_first',
                                          max_length=max_length,
                                          return_tensors='pt')
    input_ids = encoded_input['input_ids']

    # Get embeddings for the current document
    with torch.no_grad():
        outputs = model(input_ids)

    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[0][0].numpy()

    return embeddings


# Initialize BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Get embeddings for all documents
all_embeddings = []
for document in data_clean_special:
    embeddings = get_embeddings_for_document(document, model, tokenizer)
    all_embeddings.append(embeddings)

# Convert the list of embeddings to a NumPy array for feeding into a machine learning model
all_embeddings_array = np.array(all_embeddings)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [17]:
all_embeddings_array

array([[-0.39275855, -0.16091648, -0.23698685, ..., -0.32612094,
         0.00503223,  0.6350243 ],
       [-0.4469154 ,  0.06083094,  0.13548426, ..., -0.15917772,
         0.2170959 ,  0.299105  ],
       [-0.45805764, -0.03506152,  0.07624942, ..., -0.41967356,
         0.0364986 ,  0.4474135 ],
       ...,
       [-0.24265134,  0.00789313,  0.13230205, ..., -0.32727203,
        -0.14761877,  0.44802603],
       [-0.4691852 ,  0.16733831, -0.00452957, ..., -0.5670947 ,
        -0.23964113,  0.6777006 ],
       [-0.3884566 , -0.18661669,  0.25578082, ..., -0.10989109,
         0.5590267 ,  0.08543449]], dtype=float32)

In [18]:
df_embeddings = pd.DataFrame(all_embeddings_array)

In [19]:
df_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.392759,-0.160916,-0.236987,0.017865,-0.185202,-0.047439,0.304725,0.294224,-0.159510,-0.203349,...,0.017093,-0.022737,0.154470,-0.062244,0.018911,0.027818,0.030458,-0.326121,0.005032,0.635024
1,-0.446915,0.060831,0.135484,-0.070987,-0.312183,-0.161954,0.375770,0.113498,-0.177271,-0.179937,...,0.217384,-0.187295,0.311242,0.138686,0.105849,-0.209970,-0.040666,-0.159178,0.217096,0.299105
2,-0.458058,-0.035062,0.076249,-0.175648,-0.267310,-0.133200,0.138207,0.147976,-0.020746,-0.216210,...,0.087419,-0.325568,-0.086733,-0.164657,-0.044922,0.176228,-0.106638,-0.419674,0.036499,0.447414
3,-0.490693,-0.300984,-0.054388,-0.211093,-0.005908,-0.158410,0.255847,0.303913,-0.253720,-0.234790,...,0.428795,-0.181079,0.330739,0.126934,-0.027351,-0.176352,0.148738,-0.554723,-0.036231,0.812043
4,-0.236580,0.074777,0.081607,-0.124781,-0.228186,-0.334135,0.245351,0.184129,0.025996,-0.131428,...,0.215896,-0.316871,-0.150677,-0.105422,0.026321,0.287165,-0.136124,-0.674244,-0.115754,0.823650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.336847,0.030665,0.035846,-0.009195,-0.038633,-0.021274,-0.058902,0.115932,0.036479,-0.072085,...,0.160017,-0.254421,0.144746,-0.065655,0.040110,0.247874,-0.064700,-0.326328,-0.010185,0.934169
996,-0.487982,-0.325495,0.213906,0.040076,-0.150274,0.061100,0.082486,0.150475,-0.268557,-0.159566,...,0.229470,0.025018,0.137159,0.063406,0.278236,-0.192878,-0.004302,-0.127661,0.478828,0.311341
997,-0.242651,0.007893,0.132302,-0.216239,0.201438,0.183189,0.205393,0.111767,-0.313924,-0.206432,...,0.095540,-0.293471,0.070165,0.059316,0.231394,-0.103824,-0.211086,-0.327272,-0.147619,0.448026
998,-0.469185,0.167338,-0.004530,-0.293314,-0.088417,0.203098,0.707114,0.249263,-0.162943,-0.643819,...,0.279231,-0.450037,0.101883,-0.068950,0.211745,-0.147089,-0.405728,-0.567095,-0.239641,0.677701


In [22]:
# Assign column names in the format 'sentence_embeddings + number'
num_dimensions = all_embeddings_array.shape[1]
column_names = [f'sentence_embeddings_{i+1}' for i in range(num_dimensions)]
df_embeddings.columns = column_names

In [23]:
df_embeddings

Unnamed: 0,sentence_embeddings_1,sentence_embeddings_2,sentence_embeddings_3,sentence_embeddings_4,sentence_embeddings_5,sentence_embeddings_6,sentence_embeddings_7,sentence_embeddings_8,sentence_embeddings_9,sentence_embeddings_10,...,sentence_embeddings_759,sentence_embeddings_760,sentence_embeddings_761,sentence_embeddings_762,sentence_embeddings_763,sentence_embeddings_764,sentence_embeddings_765,sentence_embeddings_766,sentence_embeddings_767,sentence_embeddings_768
0,-0.392759,-0.160916,-0.236987,0.017865,-0.185202,-0.047439,0.304725,0.294224,-0.159510,-0.203349,...,0.017093,-0.022737,0.154470,-0.062244,0.018911,0.027818,0.030458,-0.326121,0.005032,0.635024
1,-0.446915,0.060831,0.135484,-0.070987,-0.312183,-0.161954,0.375770,0.113498,-0.177271,-0.179937,...,0.217384,-0.187295,0.311242,0.138686,0.105849,-0.209970,-0.040666,-0.159178,0.217096,0.299105
2,-0.458058,-0.035062,0.076249,-0.175648,-0.267310,-0.133200,0.138207,0.147976,-0.020746,-0.216210,...,0.087419,-0.325568,-0.086733,-0.164657,-0.044922,0.176228,-0.106638,-0.419674,0.036499,0.447414
3,-0.490693,-0.300984,-0.054388,-0.211093,-0.005908,-0.158410,0.255847,0.303913,-0.253720,-0.234790,...,0.428795,-0.181079,0.330739,0.126934,-0.027351,-0.176352,0.148738,-0.554723,-0.036231,0.812043
4,-0.236580,0.074777,0.081607,-0.124781,-0.228186,-0.334135,0.245351,0.184129,0.025996,-0.131428,...,0.215896,-0.316871,-0.150677,-0.105422,0.026321,0.287165,-0.136124,-0.674244,-0.115754,0.823650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.336847,0.030665,0.035846,-0.009195,-0.038633,-0.021274,-0.058902,0.115932,0.036479,-0.072085,...,0.160017,-0.254421,0.144746,-0.065655,0.040110,0.247874,-0.064700,-0.326328,-0.010185,0.934169
996,-0.487982,-0.325495,0.213906,0.040076,-0.150274,0.061100,0.082486,0.150475,-0.268557,-0.159566,...,0.229470,0.025018,0.137159,0.063406,0.278236,-0.192878,-0.004302,-0.127661,0.478828,0.311341
997,-0.242651,0.007893,0.132302,-0.216239,0.201438,0.183189,0.205393,0.111767,-0.313924,-0.206432,...,0.095540,-0.293471,0.070165,0.059316,0.231394,-0.103824,-0.211086,-0.327272,-0.147619,0.448026
998,-0.469185,0.167338,-0.004530,-0.293314,-0.088417,0.203098,0.707114,0.249263,-0.162943,-0.643819,...,0.279231,-0.450037,0.101883,-0.068950,0.211745,-0.147089,-0.405728,-0.567095,-0.239641,0.677701


In [24]:
df_embeddings.to_csv('sentence_embeddings.csv', index=False)