# Install Libraries

In [None]:
!pip install simpletransformers datasets tqdm pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 7.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 68.0 MB/s 
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 55.5 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.1 MB/s 
Collecting transformers>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |██████

In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from simpletransformers.t5 import T5Model
from sklearn.model_selection import train_test_split
import sklearn

# Load Dataset

In [None]:
# Fetch the ParaSCI paraphrase corpus; the result is indexed by split name (e.g. dataset["train"]).
dataset = load_dataset("HHousen/ParaSCI")

Using custom data configuration HHousen--ParaSCI-cf47fefb47dc491e


Downloading and preparing dataset csv/HHousen--ParaSCI to /root/.cache/huggingface/datasets/HHousen___csv/HHousen--ParaSCI-cf47fefb47dc491e/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/73.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/585k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/622k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/616k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/851k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/HHousen___csv/HHousen--ParaSCI-cf47fefb47dc491e/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def process_tapaco_dataset(dataset, out_file, split="train"):
    """Flatten one split of a sentence-pair dataset into a DataFrame and save it.

    Despite its name, the function only requires that every record of
    ``dataset[split]`` is a mapping containing ``sentence1`` and ``sentence2``.

    Args:
        dataset: Mapping from split name to an iterable of records.
        out_file: Destination path. The file is written tab-separated, with a
            header row and without the index column.
        split: Name of the split to export. Defaults to ``"train"``, which is
            the split this function has always read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence1`` and ``sentence2``,
        one row per record of the chosen split.

    Raises:
        KeyError: If ``split`` is missing from ``dataset`` or a record lacks
            one of the two sentence fields.
    """
    columns = ["sentence1", "sentence2"]
    # Pick the fields by name instead of relying on each record's key order:
    # a different column order (or an extra column) must never mislabel pairs.
    rows = [[record[name] for name in columns] for record in tqdm(dataset[split])]
    pairs_df = pd.DataFrame(data=rows, columns=columns)
    pairs_df.to_csv(out_file, sep="\t", index=False)
    return pairs_df

In [None]:
# Export the sentence pairs; the file is written tab-separated even though its name ends in ".csv".
tapaco_df = process_tapaco_dataset(dataset, "tapaco_huggingface.csv")

100%|██████████| 338717/338717 [00:22<00:00, 14825.98it/s]


In [None]:
# Preview the first rows of the extracted sentence pairs.
tapaco_df.head()

Unnamed: 0,sentence1,sentence2
0,"for all methods , the tweets were tokenized wi...",the tweets were tokenized and part-ofspeech ta...
1,it was shown by nederhof et al that prefix pro...,"nederhof et al , for instance , show that pref..."
2,"first , kikuchi et al proposed a new long shor...","first , kikuchi et al tried to control the len..."
3,with word confusion networks further improves ...,the complexity is dominated by the word confus...
4,fofe can model the word order in a sequence ba...,fofe can model the word order in a sequence us...


In [None]:
# Rename the pair columns to input_text / target_text (modifies tapaco_df in place).
tapaco_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"},
    inplace=True,
)

In [None]:
# Constant task prefix on every row; the same "paraphrase" prefix is prepended to inputs at prediction time.
tapaco_df["prefix"] = "paraphrase"

In [None]:
# Hold out 10% of the pairs for evaluation. The fixed random_state makes the
# split identical on every Restart & Run All instead of changing per run.
train_data, test_data = train_test_split(tapaco_df, test_size=0.1, random_state=42)

In [None]:
# Display the training split.
train_data

Unnamed: 0,input_text,target_text,prefix
103733,"in particular , several hard combinatorial pro...","more specifically , many np-hard graph problem...",paraphrase
287777,low-density parity-check codes were first prop...,the first well-known example is the lowdensity...,paraphrase
314243,the particles are detected to subpixel accurac...,the position and trajectories of the particle ...,paraphrase
171835,these statistics resemble what is observed for...,this resembles the statistics found in the exi...,paraphrase
201449,convolutional neural networks have achieved su...,deep convolutional neural networks achieve imp...,paraphrase
...,...,...,...
156343,deep learning has led to significant improveme...,"in recent years , deep learning methods have s...",paraphrase
220000,if one chooses the cosmological horizon as the...,"conversely , if the role of the horizons is in...",paraphrase
24003,text categorization is the task of automatical...,text categorization is the task of classifying...,paraphrase
21180,sentence compression is a text-to-text generat...,sentence compression is the task of producing ...,paraphrase


# Model Definition

In [None]:
# Training configuration handed to simpletransformers' T5Model.
args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 256,
    "num_train_epochs": 2,
    # Generation settings: sampling with top-k / top-p filtering, no beam count set.
    "num_beams": None,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "use_multiprocessing": False,
    "save_steps": -1,
    "save_eval_checkpoints": True,
    "evaluate_during_training": False,
    "adam_epsilon": 1e-08,
    "eval_batch_size": 6,
    # The library option is spelled "fp16". The former "fp_16" key is not a
    # simpletransformers option name, so mixed precision was never disabled.
    "fp16": False,
    "gradient_accumulation_steps": 16,
    "learning_rate": 0.0003,
    "max_grad_norm": 1.0,
    "n_gpu": 1,
    # The library option for seeding is "manual_seed"; the former "seed" key
    # is not a simpletransformers option name, so training was left unseeded.
    "manual_seed": 42,
    "train_batch_size": 6,
    "warmup_steps": 0,
    "weight_decay": 0.0,
}

In [None]:
# Start from the pretrained "t5-small" checkpoint, configured with the training args defined above.
model = T5Model("t5", "t5-small", args=args)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# No use_cuda here: train_model has no such parameter, and extra keyword
# arguments are interpreted as metric functions (name -> callable), so a
# boolean would be invoked as a metric if generated-text evaluation were enabled.
# Device selection is an option of the T5Model constructor.
model.train_model(train_data, eval_data=test_data, acc=sklearn.metrics.accuracy_score)

  0%|          | 0/304845 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/50808 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/50808 [00:00<?, ?it/s]

(6350, 1.8858970098458523)

# Testing the Model Once It Is Ready

In [None]:
import os

In [None]:
# The fine-tuned model is read back from the "outputs" folder under the current working directory.
root_dir = os.getcwd()
trained_model_path = os.path.join(root_dir, "outputs")

In [None]:
# Inference-time settings for the reloaded model (five candidates per input).
# NOTE(review): do_sample is not set here; presumably it is picked up from the
# arguments saved alongside the trained model — confirm.
args = dict(
    overwrite_output_dir=True,
    max_seq_length=256,
    max_length=50,
    top_k=50,
    top_p=0.95,
    num_return_sequences=5,
)

In [None]:
# Reload the fine-tuned model from disk with the inference settings.
trained_model = T5Model("t5", trained_model_path, args=args)

In [None]:
# Generate paraphrase candidates for one example sentence and print them.
prefix = "paraphrase"
source_sentences = [f"{prefix}: The house will be cleaned by me every Saturday."]
pred = trained_model.predict(source_sentences)
print(pred)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/5 [00:00<?, ?it/s]

[['the house will be cleaned by me every Saturday.', 'the house will be cleaned every 16 and 16 by me.', 'the house will be cleaned by me every Saturday.', 'the whole house will be cleaned by me every Sunday.', 'the y hsah will be cleaned every Saturday.']]
