In [1]:
## Importing all required libraries
import json
import os
import pickle
from pathlib import Path

import numpy as np
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [2]:
## Installing Transformers (uncomment the line below and run it once, before the import cell, if the package is missing)
#!pip install transformers

In [3]:
## Loading the model and tokenizer
## (both are created from the same "bert-base-uncased" checkpoint name)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

## Setting model to evaluation mode
## (disables the Dropout layers, so inference is deterministic)
model.eval()

## Run on the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
## Paths for the training, dev and test JSON files.
## The data directory can be overridden through the LEGALNLP_DATA_DIR environment
## variable so the notebook also runs on other machines; when the variable is
## not set, the original local directory is used.
main_input_path = os.environ.get(
    'LEGALNLP_DATA_DIR',
    '/Users/ahmed/Desktop/SS23/practical-legalNLP/data',
)

## "CL" dataset splits
path_cl_train = Path(main_input_path, 'CL', 'CL_train.json')
path_cl_dev = Path(main_input_path, 'CL', 'CL_dev.json')
path_cl_test = Path(main_input_path, 'CL', 'CL_test.json')

## "IT" dataset splits
path_it_train = Path(main_input_path, 'IT', 'IT_train.json')
path_it_dev = Path(main_input_path, 'IT', 'IT_dev.json')
path_it_test = Path(main_input_path, 'IT', 'IT_test.json')

In [5]:
## Directory that receives one pickle file per dataset split
main_output_path = './bert_sentence_independent_embeddings'

## "CL" dataset outputs (every target carries the '.pkl' extension explicitly,
## so the names here match the files that end up on disk)
output_cl_train = Path(main_output_path, 'CL_train.pkl')
output_cl_dev = Path(main_output_path, 'CL_dev.pkl')
output_cl_test = Path(main_output_path, 'CL_test.pkl')

## "IT" dataset outputs
output_it_train = Path(main_output_path, 'IT_train.pkl')
output_it_dev = Path(main_output_path, 'IT_dev.pkl')
output_it_test = Path(main_output_path, 'IT_test.pkl')

In [6]:
def load_json_file(path) -> dict:
    """Read a JSON file and return its parsed content.

    Args:
        path: Location of the JSON file (``str`` or ``pathlib.Path``).

    Returns:
        The object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # default encoding (which is not UTF-8 everywhere).
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_pickle(path, d):
    """Pickle the dictionary ``d`` to ``path``, using a ``.pkl`` extension.

    Args:
        path: Destination file as ``str`` or ``pathlib.Path``. Its suffix is
            forced to ``.pkl``: a missing suffix is added and a different
            existing suffix is replaced.
        d: The dictionary to serialize.

    Raises:
        AssertionError: If ``d`` is not a dictionary.
        OSError: If the destination cannot be created or written.
    """
    # Check if d is a dictionary
    assert isinstance(d, dict), "Input must be a dictionary"

    # Accept plain strings as well as Path objects, then force the '.pkl' suffix
    path = Path(path).with_suffix(".pkl")

    # The output directory may not exist yet on a fresh checkout; create it so
    # a long embedding run does not fail at the very last step.
    path.parent.mkdir(parents=True, exist_ok=True)

    # Save the dictionary using pickle
    print(f'Saving data ..... in {path}')
    with open(path, "wb") as f:
        pickle.dump(d, f)

In [7]:
def get_sentence_bert_embeddings(tokenizer, model, sentence):
    """Return the last-layer embedding at position 0 ([CLS]) for one sentence.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus`` and returning
            ``input_ids`` and ``attention_mask`` tensors (a ``BertTokenizer``
            in this notebook).
        model: Torch module whose first output is the last hidden state
            (a ``BertModel`` in this notebook).
        sentence: Text to embed; it is truncated to at most 512 tokens.

    Returns:
        The hidden vector of the first token as a flat list of floats.
    """
    # Only one sentence is encoded per call, so padding is unnecessary: it
    # would make every forward pass 512 tokens long for no benefit.
    encoded = tokenizer.encode_plus(sentence, truncation=True, return_tensors='pt', max_length=512)

    # The inputs must live on the same device as the model weights, otherwise
    # the forward pass fails when the model was moved to the GPU.
    device = next(model.parameters()).device
    input_ids = encoded['input_ids'].to(device)
    # Pass the mask explicitly so any non-real tokens are never attended to.
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] is (batch, tokens, hidden); take batch item 0, token 0.
    return outputs[0][0, 0].flatten().tolist()

def get_doc_bert_embeddings(tokenizer, model, doc):
    """Embed every labelled sentence of one document.

    Reads ``doc['sentences']`` and ``doc['complete']`` (the labels), drops each
    position whose label equals the string ``'None'``, and embeds the remaining
    sentences one by one with ``get_sentence_bert_embeddings``.

    Returns:
        A dict with the keys ``'embeddings'`` (one vector per kept sentence)
        and ``'labels'`` (the labels of the kept sentences, in order).
    """
    doc_sentences = doc['sentences']
    doc_labels = doc['complete']

    # The sentences may arrive as one string instead of a list: cut two
    # characters off each end and split on the quote-comma-quote separator.
    if isinstance(doc_sentences, str):
        doc_sentences = doc_sentences[2:-2].split('\', \'')

    # Keep only the positions whose label is not the string 'None'
    kept_sentences, kept_labels = [], []
    for position, label in enumerate(doc_labels):
        if label == 'None':
            continue
        kept_sentences.append(doc_sentences[position])
        kept_labels.append(label)

    embeddings = [
        get_sentence_bert_embeddings(tokenizer, model, text)
        for text in kept_sentences
    ]
    return {'embeddings': embeddings, 'labels': kept_labels}

            
def prepare_data(data_input_path, data_output_path, tokenizer, model):
    """Embed every document of one JSON split and pickle the result.

    Loads the JSON file at ``data_input_path``, runs
    ``get_doc_bert_embeddings`` on each entry (with a progress bar), and hands
    the resulting ``{key: embedded document}`` dictionary to ``save_pickle``
    together with ``data_output_path``.
    """
    documents = load_json_file(data_input_path)

    embedded_documents = {}
    for doc_id, doc in tqdm(documents.items()):
        embedded_documents[doc_id] = get_doc_bert_embeddings(tokenizer, model, doc)

    save_pickle(data_output_path, embedded_documents)

In [None]:
## Embed the CL training split and pickle the result to output_cl_train
prepare_data(path_cl_train, output_cl_train, tokenizer, model)

 20%|████████▊                                   | 8/40 [02:38<12:13, 22.91s/it]

In [None]:
## Embed the CL dev split and pickle the result to output_cl_dev
prepare_data(path_cl_dev, output_cl_dev, tokenizer, model)

In [None]:
## Embed the CL test split and pickle the result to output_cl_test
prepare_data(path_cl_test, output_cl_test, tokenizer, model)

 80%|████████████████████████████████████         | 4/5 [01:38<00:25, 25.27s/it]

In [None]:
## Embed the IT training split and pickle the result to output_it_train
prepare_data(path_it_train, output_it_train, tokenizer, model)

In [None]:
## Embed the IT dev split and pickle the result to output_it_dev
prepare_data(path_it_dev, output_it_dev, tokenizer, model)

In [None]:
## Embed the IT test split and pickle the result to output_it_test
prepare_data(path_it_test, output_it_test, tokenizer, model)