In [108]:
import pandas as pd
import random
import numpy as np
import torch
from transformers import AutoTokenizer
from transformers import TrainingArguments


from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from tqdm import tqdm

In [14]:
#load data
data = pd.read_pickle('data/processed/bert_data.pickle')

In [83]:
## Create dataloader
random_seed = 3060 #To replicate
number_of_sentences_pr_doc = 20  
sentence_pairs = [] #Output list of training examples

random.seed(random_seed) 
np.random.seed(random_seed)

#Create sentence pairs. Output is a list of dicts containing keys "texts" with two sentences and "label"
for bill in tqdm.tqdm(data['bill_id'].unique()):
    sentences_bill_x = data[data['bill_id']==bill]['sentences'].iat[0]
    n_sample = min(number_of_sentences_pr_doc, len(sentences_bill_x))
    sentence_one = random.sample(sentences_bill_x,k=n_sample) 
    same_document_dummy = np.random.choice([0, 1], size=(n_sample,))
    sentence_two_same_bill = [random.sample(sentences_bill_x,k=1) for _ in range(sum(same_document_dummy))]
    sentence_two_another_bill = [random.sample(data[data['bill_id']!=bill].sample(n=1,replace=False,random_state = random_seed)['sentences'].iat[0],k=1) for _ in range(n_sample - sum(same_document_dummy))]
    
    j = 0
    k = 0
    for x in range(len(sentence_one)):
        if same_document_dummy[x] == 1:
            sentence_pair = dict(texts=[sentence_one[x],sentence_two_same_bill[j][0]],label=1)
            j+=1
        elif same_document_dummy[x] == 0:
            sentence_pair = dict(texts=[sentence_one[x],sentence_two_another_bill[k][0]],label=0)
            k+=1
        sentence_pairs.append(sentence_pair)

100%|██████████| 22115/22115 [08:30<00:00, 43.36it/s]


In [112]:
#split data in training and validation
train_data, val_data = torch.utils.data.random_split(sentence_pairs, [300000, 22616],generator=torch.Generator().manual_seed(random_seed))

#prepare data for training
sentence_pairs_input = [InputExample(texts=x['texts'],label=x['label']) for x in train_data]
sentence1_val = [x['texts'][0] for x in val_data.dataset]
sentence2_val = [x['texts'][1] for x in val_data.dataset]
label_val = [x['label'] for x in val_data.dataset]
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
evaluator = evaluation.BinaryClassificationEvaluator(sentence1_val,sentence2_val,label_val)


In [113]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
#Define loss function
train_loss = losses.SoftmaxLoss(model, model.get_sentence_embedding_dimension(), num_labels=2)

#Tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=4, warmup_steps=100,evaluator=evaluator, evaluation_steps=500)

Iteration:   0%|          | 0/18750 [00:00<?, ?it/s]
Epoch:   0%|          | 0/4 [00:00<?, ?it/s]


AttributeError: 'dict' object has no attribute 'texts'

In [None]:
#Define the model. Either from scratch of by loading a pre-trained model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

