In [1]:
import nltk

from datasets import load_metric
from transformers import Trainer,Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback,ProgressCallback,PrinterCallback 

import numpy as np
import pandas as pd
import glob

import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration

from tqdm.notebook import tqdm
tqdm.pandas()

from transformers import DataCollatorForSeq2Seq

from datasets import Dataset

In [2]:
from utils import *

In [3]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    bleu_result = bleu.compute(predictions=[x.split(' ') for x in decoded_preds], 
                               references=[[x.split(' ')] for x in decoded_labels])
    
    bleu_result.update({'precisions_1grams': bleu_result['precisions'][0]})
    del bleu_result['precisions']
    
    return  {key: round(value,4) for key, value in bleu_result.items()}

In [4]:
def preprocess_function(examples):    
    model_inputs = tokenizer(examples["text"], max_length=max_input_length, padding=padding,truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["str_label"], max_length=max_target_length, padding=padding,truncation=True)
    labels["input_ids"] = [
                    [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
                ]
            
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [5]:
train_pth = glob.glob('./Nordigen_DS_homework_2021/train.csv')[0]

train_df = pd.read_csv(train_pth)
train_df = train_df[~train_df['comment'].isna()]

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
tokenizer = T5Tokenizer.from_pretrained("mrm8488/t5-base-finetuned-sarcasm-twitter")

model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-sarcasm-twitter", return_dict=True).to(device)
model.resize_token_embeddings(len(tokenizer))

Embedding(32100, 768)

In [8]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [9]:
train_df_head = train_df.head(10000)
train_df_head['scrcm'] = train_df_head['comment'].progress_apply(lambda x: eval_conversation(x,tokenizer,model))
train_df_head['scrcm'].value_counts()

  0%|          | 0/10000 [00:00<?, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<pad> derison           6899
<pad> normal</s>        2959
<pad> de de               17
<pad> Lol                  4
<pad> Exactly              3
                        ... 
<pad> de normal            1
<pad> appare               1
<pad> okay okay            1
<pad> Russell Wilson       1
<pad> Q Q                  1
Name: scrcm, Length: 114, dtype: int64

In [10]:
train_df_head['int_scrcm'] = train_df_head['scrcm'].astype("category").cat.rename_categories({"<pad> derison": 1, "<pad> normal</s>": 0})
train_df_head[train_df_head['int_scrcm'].isin([1,0])][['int_scrcm','label']].progress_apply(
    lambda row: row['int_scrcm']==row['label'],axis=1).value_counts()/10000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


  0%|          | 0/9858 [00:00<?, ?it/s]

True     0.5555
False    0.4303
dtype: float64

In [11]:
train_df_head['text'] = train_df_head['parent_comment']+train_df_head['comment']
train_df_head['scrcm'] = train_df_head['text'].progress_apply(lambda x: eval_conversation(x,tokenizer,model))
train_df_head['scrcm'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


  0%|          | 0/10000 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<pad> derison         6930
<pad> normal</s>      3056
<pad> de de              3
<pad> EL                 1
<pad> UND                1
<pad> negative</s>       1
<pad> k                  1
<pad> ER                 1
<pad> WH                 1
<pad> oko                1
<pad> 2212               1
<pad> RI                 1
<pad> g                  1
<pad> IT IT              1
Name: scrcm, dtype: int64

In [12]:
train_df_head['int_scrcm'] = train_df_head['scrcm'].astype("category").cat.rename_categories({"<pad> derison": 1, "<pad> normal</s>": 0})
train_df_head[train_df_head['int_scrcm'].isin([1,0])][['int_scrcm','label']].progress_apply(
    lambda row: row['int_scrcm']==row['label'],axis=1).value_counts()/10000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


  0%|          | 0/9986 [00:00<?, ?it/s]

True     0.5800
False    0.4186
dtype: float64

In [9]:
train_df['text'] = train_df['parent_comment']+train_df['comment']

In [10]:
train_df['text'].apply(len).min()

2

In [11]:
train_df['text'].apply(len).median()

133.0

In [12]:
bleu = load_metric("bleu")

In [13]:
freeze_params(model.get_encoder())

# freeze_params(model.get_decoder())

In [14]:
tokenizer.tgt_lang = 'en_EN'

In [15]:
train_df['str_label'] = train_df['label'].astype("category").cat.rename_categories(
                    {1:"derison",  0:"normal"}).astype(str)

In [16]:
custom_dataset = Dataset.from_pandas(train_df[['text','str_label']].reset_index(drop=True))
custom_dataset.shuffle(seed=42)

Dataset({
    features: ['text', 'str_label'],
    num_rows: 909697
})

In [17]:
custom_dataset[0]

{'text': 'Need my Dodgerbros help returning fire at my coworker My coworker (Cardinals fan) keeps posting dumbass anti-Dodger memes on her desk in order to incite rage in me. Help me fight back with some Cardinals slander?""I like my shortstops how I like my beef... injected full of hormones""- picture of Delvin Perez and Jhonny Peralta and a roided out bull/cow.',
 'str_label': 'normal'}

In [18]:
ds_len = int(train_df.shape[0]/2)

In [19]:
train_samples = int(ds_len*0.8)
test_samples = int(ds_len*0.2)

In [20]:
train_dataset = custom_dataset.select(range(train_samples))
train_dataset

Dataset({
    features: ['text', 'str_label'],
    num_rows: 363878
})

In [21]:
test_dataset = custom_dataset.select(range(train_samples,ds_len))
test_dataset

Dataset({
    features: ['text', 'str_label'],
    num_rows: 90970
})

In [22]:
max_length=int(train_df['text'].apply(len).median()*2)
min_length=30

max_target_length = 10

max_input_length = max_length
padding =  'max_length'
label_pad_token_id = -100

batch_size = 32
num_train_epochs=10

In [23]:
full_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names)
full_eval_dataset = test_dataset.map(
    preprocess_function, 
    batched=True,
    remove_columns=test_dataset.column_names)

  0%|          | 0/364 [00:00<?, ?ba/s]

  0%|          | 0/91 [00:00<?, ?ba/s]

In [24]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model.to(device),
    label_pad_token_id=label_pad_token_id)

In [25]:
early_stop = EarlyStoppingCallback(early_stopping_patience = 5,early_stopping_threshold = 0)

In [26]:
training_args = Seq2SeqTrainingArguments(
    "checkpoints",
    num_train_epochs=num_train_epochs,
    do_eval = True,
    evaluation_strategy = "epoch",
#     eval_steps = 1000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=1000, # number of warmup steps for learning rate scheduler
    weight_decay=0.01,
    logging_dir='./logs', # directory for storing logs
    logging_strategy='steps',
    logging_steps=1000,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    save_total_limit=3,
    save_strategy = 'epoch',
    predict_with_generate=True,
    fp16=True,
    seed = 42,
    load_best_model_at_end = True,
#     disable_tqdm=True
)

In [27]:
trainer = Seq2SeqTrainer(
    model=model.to(device), # the instantiated :hugs: Transformers model to be trained
    args=training_args, # training arguments, defined above
    train_dataset=full_train_dataset, # training dataset
    eval_dataset=full_eval_dataset,# evaluation dataset
    data_collator = data_collator,
    callbacks = [early_stop,PrinterCallback()],
    tokenizer=tokenizer,
    compute_metrics= compute_metrics
) 

Using amp fp16 backend


In [None]:
trainer.evaluate()

trainer.train()

In [29]:
trainer.save_model("./model")
tokenizer.save_pretrained("./tokenizer")
torch.save(training_args, 'training_args.bin')

Saving model checkpoint to ./model
Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json
Copy vocab file to ./model/spiece.model
tokenizer config file saved in ./tokenizer/tokenizer_config.json
Special tokens file saved in ./tokenizer/special_tokens_map.json
Copy vocab file to ./tokenizer/spiece.model


In [30]:
del trainer,data_collator,full_train_dataset,full_eval_dataset
del model, tokenizer
torch.cuda.empty_cache()
torch.cuda.synchronize()

In [31]:
tokenizer = T5Tokenizer.from_pretrained("./tokenizer")
model = T5ForConditionalGeneration.from_pretrained("./model").to(device)

Didn't find file ./tokenizer/added_tokens.json. We won't load it.
Didn't find file ./tokenizer/tokenizer.json. We won't load it.
loading file ./tokenizer/spiece.model
loading file None
loading file ./tokenizer/special_tokens_map.json
loading file ./tokenizer/tokenizer_config.json
loading file None
loading configuration file ./model/config.json
Model config T5Config {
  "_name_or_path": "mrm8488/t5-base-finetuned-sarcasm-twitter",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "

In [32]:
train_df_head = train_df.head(10000)

In [33]:
train_df_head['scrcm'] = train_df_head['text'].progress_apply(lambda x: eval_conversation(x,tokenizer,model))

  0%|          | 0/10000 [00:00<?, ?it/s]

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [34]:
train_df_head['scrcm'].value_counts()

<pad> normal</s>    5576
<pad> derison       4424
Name: scrcm, dtype: int64

In [35]:
train_df_head['int_scrcm'] = train_df_head['scrcm'].astype("category").cat.rename_categories({"<pad> derison": 1, "<pad> normal</s>": 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
train_df_head[train_df_head['int_scrcm'].isin([1,0])][['int_scrcm','label']].progress_apply(
    lambda row: row['int_scrcm']==row['label'],axis=1).value_counts()

  0%|          | 0/10000 [00:00<?, ?it/s]

True     7054
False    2946
dtype: int64