In [42]:
import transformers
from datasets import Dataset, DatasetDict
from evaluate import load
import numpy as np
import pandas as pd
import torch
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danii\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
data = pd.read_csv('train.csv')
metric = load('rouge')
model_checkpoints = 'facebook/bart-large-xsum'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)

In [9]:
len(tokenizer.encode(data.abstract.max(), return_tensors='pt')[0])

186

In [10]:
len(tokenizer.encode(data.title.max(), return_tensors='pt')[0])

19

In [43]:
train = Dataset.from_pandas(data[:125000])
val = Dataset.from_pandas(data[125000:])
# test = Dataset.from_pandas(data[130000:])

data = DatasetDict({'train':train, 'validation':val})

In [44]:
data

DatasetDict({
    train: Dataset({
        features: ['abstract', 'title'],
        num_rows: 120000
    })
    validation: Dataset({
        features: ['abstract', 'title'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['abstract', 'title'],
        num_rows: 5000
    })
})

In [11]:
max_input = 256
max_target = 32

In [52]:
def preprocess_data(data_to_process):
    inputs = [text for text in data_to_process['abstract']]
    model_inputs = tokenizer(inputs, max_length=max_input, padding='max_length', truncation=True)

    with tokenizer.as_target_tokenizer():
        targets = tokenizer(data_to_process['title'], max_length=max_target, padding='max_length', truncation=True)
    
    model_inputs['labels'] = targets['input_ids']
    return model_inputs

In [53]:
tokenized_data = data.map(preprocess_data, batched=True, remove_columns=['abstract', 'title'])

  0%|          | 0/120 [00:00<?, ?ba/s]



  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [54]:
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [55]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [56]:
batch_size = 12

In [57]:
def compute_rouge(pred):
    predictions, labels = pred
    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
    res = {key: value*100 for key, value in res.items()}

    pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    res['gen_len'] = np.mean(pred_lens)

    return {k: round(v, 4) for k,v in res.items()}

In [58]:
args = transformers.Seq2SeqTrainingArguments(
    'conversation-summ',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    eval_accumulation_steps=1
)

In [59]:
trainer = transformers.Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

In [60]:
trainer.train()

***** Running training *****
  Num examples = 120000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 5625
  Number of trainable parameters = 406290432


  0%|          | 0/5625 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [None]:
trainer.save_model("./my_model")

In [None]:
from transformers import DistilBertConfig, DistilBertModel
path = 'path_to_my_model'
model = DistilBertModel.from_pretrained(path)