In [None]:
!pip install sentencepiece
!pip install transformers
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch

import json
import pandas as pd
from torch.utils.data import DataLoader, Dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data = list(parse_data('data.json'))
headlines = []

for i in (data):
  headlines.append(i['headline'])

In [18]:

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
#print(tokenizer)
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
df = pd.DataFrame(data)
df['model_truth_values'] = df['is_sarcastic'].apply(lambda input: [0,1] if input == 1 else [1,0])
# df['is_sarcastic'].apply(lambda input: [0,1] if input == 1 else [1,0])
df['model_truth_values']

0        [0, 1]
1        [1, 0]
2        [1, 0]
3        [0, 1]
4        [0, 1]
          ...  
28614    [0, 1]
28615    [0, 1]
28616    [1, 0]
28617    [0, 1]
28618    [0, 1]
Name: model_truth_values, Length: 28619, dtype: object

In [20]:
df

Unnamed: 0,is_sarcastic,headline,article_link,model_truth_values
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,"[0, 1]"
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,"[1, 0]"
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,"[1, 0]"
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,"[0, 1]"
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,"[0, 1]"
...,...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...,"[0, 1]"
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...,"[0, 1]"
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...,"[1, 0]"
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...,"[0, 1]"


In [21]:
df.is_sarcastic

0        1
1        0
2        0
3        1
4        1
        ..
28614    1
28615    1
28616    0
28617    1
28618    1
Name: is_sarcastic, Length: 28619, dtype: int64

In [22]:
def get_BERT_input(headlines, tokenizer):
  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []
  attention_masks = []
  encoded_dict = tokenizer.batch_encode_plus(
                      headlines,                      # Sentence to encode.
                      add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                      max_length = 65,           # Pad & truncate all sentences.
                      pad_to_max_length = True,
                      return_attention_mask = True,   # Construct attn. masks.
                      return_tensors = 'pt',     # Return pytorch tensors.
                )
  input_ids, attention_mask = encoded_dict['input_ids'], encoded_dict['attention_mask']
  # Add the encoded sentence to the list.    
  input_ids = encoded_dict['input_ids']
  
  # And its attention mask (simply differentiates padding from non-padding).
  attention_mask = encoded_dict['attention_mask']
  return torch.tensor(input_ids), torch.tensor(attention_mask)

In [23]:
get_BERT_input(["Ani draw eyeliner today","It is very hot today"],tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(tensor([[    0, 32340, 79442, 36298, 79852, 18925,     2,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1],
         [    0,  1650,    83,  4552,  8010, 18925,     2,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,  

In [24]:
# Dataset

In [25]:

class SatcasmDataset(Dataset):
    def __init__(self, df):
        self.headlines = df["headline"]
        self.model_truth_values = df["model_truth_values"]
        
    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        
        return self.headlines.iloc[idx], torch.tensor(self.model_truth_values.iloc[idx])

In [26]:
dataset = SatcasmDataset(df)

In [27]:
dataset[50]

('lgbt christians speak out: "love the sinner, hate the sin" won\'t cut it anymore',
 tensor([1, 0]))

In [28]:
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

Pythorch model


In [29]:
from torch import nn

class SarcasmModel(nn.Module):
    def __init__(self):
        super(SarcasmModel, self).__init__()
        self.XLM = tokenizer = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        self.hidden_1 = nn.Linear(768,2)
        self.softmax = nn.Softmax(dim = 1)
        self.to_delete = 2
        

    def forward(self, b_input_ids, b_attention_mask):
        bert_output = self.XLM(b_input_ids, b_attention_mask)
        hidden_state = bert_output['last_hidden_state']
        sentence_vector = torch.mean(hidden_state, dim =  1)
        x = self.hidden_1(sentence_vector)
        probabilities = self.softmax(x)
        return probabilities




In [30]:
from torch.optim import AdamW
model = SarcasmModel()
model = model.to(device)
loss_function = nn.BCELoss().to(device)
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
batch_size = 32
for i_batch, (b_headlines, b_model_truth_values) in enumerate(train_dataloader):
  # print(step)
  # print((b_headlines[0]))
  # print((b_model_truth_values[0]))
  optimizer.zero_grad()
  b_input_ids, b_attention_mask = get_BERT_input(b_headlines,tokenizer)
  b_prediction = model(b_input_ids.to(device),b_attention_mask.to(device))
  loss = loss_function(b_prediction.to(device),b_model_truth_values.float().to(device))
  
  if i_batch % 10 == 0:
      iteration = i_batch*batch_size
      print("Iteration:", i_batch*batch_size, "Loss:", loss.data)
      batch_accuracy = torch.mean(torch.sum(b_prediction * b_model_truth_values.to(device), dim=1))
      print("Batch Accuracy:", batch_accuracy.data*100)
      if iteration % 5120 == 0:
        # torch.save(model.state_dict(), expt_folder + "SarcasmModel.pt")
        print("Saved Model")

  model.zero_grad()
  loss.backward()
  optimizer.step()
  

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Iteration: 0 Loss: tensor(0.6860, device='cuda:0')
Batch Accuracy: tensor(50.5546, device='cuda:0')
Saved Model
Iteration: 320 Loss: tensor(0.6877, device='cuda:0')
Batch Accuracy: tensor(50.2874, device='cuda:0')
Iteration: 640 Loss: tensor(0.6891, device='cuda:0')
Batch Accuracy: tensor(50.2291, device='cuda:0')
Iteration: 960 Loss: tensor(0.6063, device='cuda:0')
Batch Accuracy: tensor(56.3638, device='cuda:0')
Iteration: 1280 Loss: tensor(0.4533, device='cuda:0')
Batch Accuracy: tensor(64.5739, device='cuda:0')
Iteration: 1600 Loss: tensor(0.4918, device='cuda:0')
Batch Accuracy: tensor(64.8719, device='cuda:0')
Iteration: 1920 Loss: tensor(0.5194, device='cuda:0')
Batch Accuracy: tensor(63.3478, device='cuda:0')
Iteration: 2240 Loss: tensor(0.4883, device='cuda:0')
Batch Accuracy: tensor(67.1954, device='cuda:0')
Iteration: 2560 Loss: tensor(0.2356, device='cuda:0')
Batch Accuracy: tensor(82.3740, device='cuda:0')
Iteration: 2880 Loss: tensor(0.3126, device='cuda:0')
Batch Accurac

In [39]:
def predict(b_headline, tokenizer, model):
  b_input_ids, b_attention_mask = get_BERT_input(b_headline, tokenizer)
  b_predictions = model(b_input_ids.to(device), b_attention_mask.to(device))
  sarcastic_probability = b_predictions.data[0][1].item() * 100
  not_sarcastic_probability = b_predictions.data[0][0].item() * 100
  print_string = "Sarcastic:", f'{sarcastic_probability:.2f}', "Not Sarcastic:", f'{not_sarcastic_probability:.2f}'
  # return b_predictions
  return print_string

In [51]:
predict(["ես սիրում եմ մայրիկիս"],tokenizer, model)



('Sarcastic:', '2.08', 'Not Sarcastic:', '97.92')