#### Dataset Preparation

In [1]:
import torch

In [2]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
3749,ks08,1,,His girlfriend bought this computer.
3890,ks08,1,,John kept him behind the garage.
2203,l-93,0,*,I shaved myself.
7985,ad03,1,,The constant reading of Shakespeare satisfied me
5702,c_13,1,,The big boy was kissed by the drooling dog.
4092,ks08,1,,John tried to make Sam let George ask Bill to ...
6816,m_02,0,*,George both built the houses.
1982,r-67,1,,I gave Jack a picture of myself.
6851,m_02,1,,Only to the best students would he give this b...
3632,ks08,1,,"John might go home, so might Bill."


In [3]:
# Get the lists of sentences and their labels.
train_batch = df.sentence.values
train_labels = df.label.values

In [4]:
train_batch = list(train_batch)

In [5]:
train_labels = torch.tensor(train_labels)

In [6]:
print(train_labels.shape)

torch.Size([8551])


#### Code for training the XLnet

In [7]:
from transformers import XLNetForSequenceClassification

In [8]:
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased').cuda()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

### Create dummy model for experiment

In [9]:
# import copy
# model_exp =  copy.deepcopy(model)

In [10]:
model.load_state_dict(torch.load("XLNet.pt"))

<All keys matched successfully>

In [11]:
# for v, g in zip(model.parameters(), model_exp.parameters()):
#     print(torch.sum(v.data - g.data))

#### Family of Optimizers

In [12]:
optimizer = torch.optim.SGD(model.parameters(),1e-5,momentum=0.9,weight_decay=3e-4)

In [13]:
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = torch.optim.SGD(optimizer_grouped_parameters,5e-4,momentum=0.99)

In [14]:
# from transformers import AdamW

# optimizer = AdamW(model.parameters(), lr=1e-5)

In [15]:
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

#### Tokenize

In [16]:
from transformers import XLNetTokenizer
# Load the XLNet tokenizer.
print('Loading XLNet Tokenizer...')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

Loading XLNet Tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




In [17]:
# !pip install sentencepiece

In [18]:
encoding = tokenizer(train_batch, return_tensors='pt', padding=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [19]:
# attention_mask

In [20]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
batch_size = 16
# Create the DataLoader for our training set.
train_data = TensorDataset(input_ids, attention_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [29]:
epochs = 1

In [30]:
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs, eta_min=1e-3)

In [31]:

for eps in range(epochs):
    # For each batch of training data...
    epoch_loss = 0
    for step, batch in enumerate(train_dataloader):
        
        model.train()
        optimizer.zero_grad()
        out = model(batch[0].cuda(), attention_mask=batch[1].cuda(), labels=batch[2].cuda(), return_dict=True)
        loss = out.loss
        epoch_loss = epoch_loss + loss
        loss.backward()
        optimizer.step()
        
        
    # For each batch of training data... print loss and the batch number
    print("Epoch: %d, Loss: %.3f" % (eps, epoch_loss/step))

Epoch: 0, Loss: 0.045


In [32]:
torch.save(model.state_dict(), "XLNet.pt")

#### Testing

In [33]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df = pd.read_csv("in_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.sample(10)

Number of training sentences: 527



Unnamed: 0,sentence_source,label,label_notes,sentence
106,r-67,1,,"Fluffy is sick, which not everybody knows."
458,sks13,0,*,Mary revealed himself to John.
283,ks08,0,*,Did the child be in the school?
101,r-67,1,,I talked to Winston about himself.
366,c_13,1,,Sylvia was slapping Jeff upside the head in ma...
87,r-67,1,,How sane is Peter?
525,ad03,1,,I shaved myself.
67,bc01,1,,We elected me.
33,bc01,0,*,What the water did to the whole bottle was fil...
386,d_98,1,,Any owl hunts mice.


In [34]:
# Get the lists of sentences and their labels.
test_batch = df.sentence.values
test_labels = df.label.values

In [35]:
test_batch = list(test_batch)
test_labels = torch.tensor(test_labels)
print(test_labels.shape)

torch.Size([527])


In [36]:
encoding = tokenizer(test_batch, return_tensors='pt', padding=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [37]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
batch_size = 16
# Create the DataLoader for our training set.
test_data = TensorDataset(input_ids, attention_mask, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [39]:
total_err = 0

for step, batch in enumerate(test_dataloader):
        
    out = model(batch[0].cuda(), attention_mask=batch[1].cuda(), return_dict=True)
    logits = out.logits
    _ , out_labels = logits.softmax(dim=1).max(dim = 1)

    error_score = abs((out_labels - batch[2].cuda()).sum())
    
    total_err = total_err + error_score.item()
    
    print(error_score)

tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(3, device='cuda:0')
tensor(1, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(2, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(2, device='cuda:0')
tensor(0, device='cuda:0')
tensor(2, device='cuda:0')
tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(1, device='cuda:0')
tensor(2, device='cuda:0')
tensor(0, device='cuda:0')


In [40]:
print(1 - total_err/527, 527 - total_err)

0.9222011385199241 486
