# Analysis of the data


In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer


dataset = load_dataset("esnli")

# Take subset of data


Reusing dataset esnli (/Users/julianbruinsma/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc)
100%|██████████| 3/3 [00:00<00:00, 20.38it/s]


In [13]:
import pandas as pd
# Get features from the dataset
train = pd.DataFrame.from_dict(dataset['train'])

# Take first 10 examples
df = train.iloc[:10]


print(type(df['premise']))

# from train dataframe, get the premise and hypothesis
premise = df['premise']
hypothesis = df['hypothesis']

# print(premise)
# print(hypothesis)

<class 'pandas.core.series.Series'>


In [19]:
import torch 
# import pad_sequence
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset

tokenizer = AutoTokenizer.from_pretrained("t5-base", use_fast=True)

# concatenate premise and hypothesis with separator token </s> 
input_ids = []
attention_mask = []
token_type_ids = []
target_ids = []

premise_list = df["premise"].tolist()
hypothesis_list = df["hypothesis"].tolist()
explanation_list = df["explanation_1"].tolist()
label_list = df["label"].tolist()

for (premise, hypothesis, explanation, label) in zip(premise_list, hypothesis_list, explanation_list, label_list):
    premise_hypothesis = f"{premise} </s> {hypothesis}"
    
    # make switch case for label if it is 0 label = entailment, if it is 1 label = neutral, if it is 2 label = contradiction
    if label == 0:
        label = "entailment"
    elif label == 1:
        label = "neutral"
    elif label == 2:
        label = "contradiction"
    
    # Concatenate label with explanation

    label_explanation = f"{label} </s> {explanation} </s>" 
    print(label_explanation)
    
    
    hypothesis_premise_tokens = tokenizer.encode_plus(
        premise_hypothesis,
        truncation=True, 
        return_token_type_ids=True, 
        max_length=128,
        )
    
    target_encoding = tokenizer.encode_plus(
        label_explanation,
        truncation=True,
        padding="longest",
        return_token_type_ids=True,
        max_length=128,
    )
    

    token_type_ids.append(torch.Tensor(hypothesis_premise_tokens.token_type_ids))
    attention_mask.append(torch.Tensor(hypothesis_premise_tokens.attention_mask))
    input_ids.append(torch.Tensor(hypothesis_premise_tokens.input_ids))
    target_ids.append(torch.Tensor(target_encoding.input_ids))

token_type_ids = pad_sequence(token_type_ids, batch_first=True)
attention_mask = pad_sequence(attention_mask, batch_first=True)
input_ids = pad_sequence(input_ids, batch_first=True)
target = pad_sequence(target_ids, batch_first=True)

dataset = TensorDataset(
    input_ids,
    attention_mask,
    token_type_ids,
    target
)

print(target_encoding)
print(hypothesis_premise_tokens)


neutral </s> the person is not necessarily training his horse </s>
contradiction </s> One cannot be on a jumping horse cannot be a diner ordering food. </s>
entailment </s> a broken down airplane is outdoors </s>
neutral </s> Just because they are smiling and waving at a camera does not imply their parents or anyone is anyone behind it </s>
entailment </s> The children must be present to see them smiling and waving. </s>
contradiction </s> One cannot be smiling and frowning at the same time. </s>
contradiction </s> One cannot be in the middle of a bridge if they are on the sidewalk. </s>
entailment </s> jumping on skateboard is the same as doing trick on skateboard. </s>
neutral </s> Just because the boy is jumping on a skateboard does not imply he is wearing safety equipment </s>
neutral </s> it is not necessarily true the man drinks his juice </s>
{'input_ids': [7163, 1, 34, 19, 59, 6539, 1176, 8, 388, 6750, 112, 5143, 1, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

target_device = target.to(device).long()




# get shape of target device
# print(target_device[:, :-1].contiguous())
# 
# 
# print("detaching")
# print(target_device[:, 1:].clone().detach())
# print(target_device[0].clone().detach())

lm_labels = target_device[:, 1:].clone().detach()
lm_labels[target_device[:, 1:] == tokenizer.pad_token_id] = -100 
print(lm_labels)


tensor([[  568,    19,    59,  6539,   761,   112,  4952,     1,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [ 1178,    36,    30,     3,     9, 15539,  4952,  1178,    36,     3,
             9,   176,    49, 12320,   542,     5,     1,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [    9,  4335,   323, 20527,    19, 10962,     1,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [  250,    79,    33, 20770,    11,  8036,  3745,    44,     3,     9,
          1861,   405,    59,     3, 18531,    70,  1362,    42,  1321,    19,
          1321,  1187,    34,     1],
        [  502,   398,    36,   915,    12,   217,   135, 20770,    11,  8036,
          3745,     5,     1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [ 1178,    3

In [1]:
from dataloader import esnli

print("initializing dataset")
dataset = esnli()
print("initializing data loader")
train_loader, val_loader, test_loader = dataset.get_data_loaders()

print(type(train_loader))


initializing dataset


Reusing dataset esnli (/Users/julianbruinsma/.cache/huggingface/datasets/esnli/plain_text/0.0.2/a160e6a02bbb8d828c738918dafec4e7d298782c334b5109af632fec6d779bbc)
100%|██████████| 3/3 [00:00<00:00, 75.94it/s]


initializing data loader
initializing train data loader
initializing val data loader
initializing test data loader
<class 'torch.utils.data.dataloader.DataLoader'>


In [9]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
input_ids = tokenizer.encode('summarize: I enjoy walking with my cute dog', return_tensors='pt')
greedy_output = model.generate(input_ids, num_beams=7, no_repeat_ngram_size=2, min_length=50, max_length=100)
print("Output:\n" + 100 * '-')

print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog - he is a joy to walk with and is very affectionate with me. I like to spend time with him on walks with his kitty cat, leo, who is so cute!


In [7]:
from torch.optim import AdamW
from transformers import T5ForConditionalGeneration
import torch
from transformers import AutoTokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small") 

tokenizer = AutoTokenizer.from_pretrained("t5-base", use_fast=True)


optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.train()
for batch, (
    token_type_ids, 
    attention_mask, 
    input_ids, 
    target_ids
) in enumerate(train_loader):
    
    optimizer.zero_grad()
    token_type_ids = token_type_ids.to(device).long()
    attention_mask = attention_mask.to(device).long()
    input_ids = input_ids.to(device).long()
    target_ids = target_ids.to(device).long()
    
    # Set the target ids and labels 
    y_ids = target_ids[:, :-1].contiguous()
    lm_labels = target_ids[:, 1:].clone().detach()
    
    # Make sure that lm_labels are not set for 0 (pad) tokens
    lm_labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100 
    
    
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=y_ids,
        labels=lm_labels,
    )
    # print shape of outputs
    outputs_copy = outputs 
    loss = outputs[0]
    loss.backward()
    optimizer.step()

    
    

    
    


tensor([[ 555, 1178,   36,  ...,    0,    0,    0],
        [ 466,    8, 1021,  ...,    0,    0,    0],
        [   8,  861,   19,  ...,    0,    0,    0],
        ...,
        [  37, 4940,   19,  ...,    0,    0,    0],
        [ 290,   19,  893,  ...,    0,    0,    0],
        [  37, 1076,   33,  ...,    0,    0,    0]])


KeyboardInterrupt: 

In [37]:
# Load t5_model.pt form local directory
from torch.optim import AdamW
from transformers import T5ForConditionalGeneration
import torch
from transformers import AutoTokenizer
# import pad_sequence
from torch.nn.utils.rnn import pad_sequence

model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(torch.load("t5_model.pt"))

target_ids_list = []
attention_mask_list = []

temp_encoding = tokenizer.encode_plus('This church choir sings to the masses as they sing joyous songs from the book at a church. </s> The church has cracks in the ceiling. </s>', 
                truncation=True, 
                return_token_type_ids=True, 
                max_length=256)

input_ids = torch.Tensor(temp_encoding.input_ids)
attention_mask = torch.Tensor(temp_encoding.attention_mask)

target_ids_list.append(input_ids)
attention_mask_list.append(attention_mask)

target_ids_list = pad_sequence(target_ids_list, batch_first=True)
attention_mask_list = pad_sequence(attention_mask_list, batch_first=True)

output = model.generate(
    input_ids = input_ids, 
    attention_mask = attention_mask, 
    num_beams=7, 
    no_repeat_ngram_size=2, 
    min_length=50, 
    max_length=80
    )
print("Output:\n" + 100 * '-')
print(tokenizer.decode(output[0], skip_special_tokens=True))

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

generated_ids = model.generate(
                input_ids = input_ids,
                attention_mask = attention_mask,
                num_beams = 10,
                max_length = 100,
                repetition_penalty = 1.0,
                length_penalty = 1.0,
                early_stopping = True,
                use_cache = True,
                )

preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

print(preds)