In [11]:
# Install the fine-tuning library into the *running kernel's* environment.
# `%pip` (unlike `!pip`) is guaranteed to target the interpreter this notebook uses.
# TODO: pin the version that was used for the recorded run (e.g. simpletransformers==X.Y.Z).
%pip install simpletransformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
# Download and unpack the PAWS-Wiki labelled (final) split into ./data.
# -nc skips the download when the archive is already present, so re-running this
# cell does not leave a duplicate "paws_wiki_labeled_final.tar.gz.1" next to it.
!wget -nc https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz -P data
!tar -xvf data/paws_wiki_labeled_final.tar.gz -C data

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2021-08-11 10:06:01--  https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.142.128, 74.125.195.128, 2607:f8b0:400e:c09::80, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.142.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4687157 (4.5M) [application/gzip]
Saving to: ‘data/paws_wiki_labeled_final.tar.gz’


2021-08-11 10:06:01 (263 MB/s) - ‘data/paws_wiki_labeled_final.tar.gz’ saved [4687157/4687157]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `to

In [9]:
import warnings
import pandas as pd


def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Load a tab-separated file and keep the rows matching ``keep_label``.

    Args:
        file_path: Path of the TSV file to read.
        input_text_column: Name of the column to expose as ``input_text``.
        target_text_column: Name of the column to expose as ``target_text``.
        label_column: Name of the column the rows are filtered on.
        keep_label: Only rows whose label equals this value are kept.

    Returns:
        A new DataFrame with the columns ``input_text``, ``target_text`` and a
        constant ``prefix`` column set to ``"paraphrase"``. Malformed lines in
        the file are skipped; the index of the kept rows is preserved.
    """
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # `on_bad_lines="skip"` is the supported spelling of the same behavior.
        df = pd.read_csv(file_path, sep="\t", on_bad_lines="skip")
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines` yet.
        df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)

    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    # Explicit copy: the column added below must land on an independent frame,
    # not on a selection of the filtered one (avoids SettingWithCopyWarning).
    df = df[["input_text", "target_text"]].copy()
    df["prefix"] = "paraphrase"

    return df


def clean_unnecessary_spaces(out_string):
    """Remove the spaces that tokenized text leaves before punctuation and contractions.

    Non-string input triggers a warning and is converted with ``str()`` before
    cleaning.

    Args:
        out_string: Text to clean (any object; non-strings are stringified).

    Returns:
        The cleaned string.
    """
    text = out_string
    if not isinstance(text, str):
        warnings.warn(f">>> {text} <<< is not a string.")
        text = str(text)

    # Applied strictly in this order: each replacement sees the previous result.
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    for spaced, tight in substitutions:
        text = text.replace(spaced, tight)
    return text

In [10]:
import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

# Configure the root logger at INFO so library progress messages show up in cell output.
logging.basicConfig(level=logging.INFO)
# Restrict the "transformers" logger to ERROR and above to keep the output readable.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [12]:
# Google PAWS data, as extracted by the download cell into ./data/final.
# The path is relative so it matches the download cell's `-P data` target
# instead of hard-coding one machine's absolute working directory.
DATA_DIR = "data/final"

# Optional row caps for a quick smoke-test run (e.g. 5000 / 500).
# None keeps the full split.
MAX_TRAIN_ROWS = None
MAX_EVAL_ROWS = None


def _load_paraphrase_pairs(tsv_path):
    """Read one TSV split and return its rows labelled "1" as seq2seq pairs.

    Every column is read as a string. ``sentence1`` / ``sentence2`` become
    ``input_text`` / ``target_text`` and a constant ``prefix`` column is added.
    The result is a new frame, so no chained-assignment warning is raised.
    """
    frame = pd.read_csv(tsv_path, sep="\t").astype(str)
    return (
        frame.loc[frame["label"] == "1"]
        .rename(columns={"sentence1": "input_text", "sentence2": "target_text"})
        .loc[:, ["input_text", "target_text"]]
        .assign(prefix="paraphrase")
    )


train_df = _load_paraphrase_pairs(f"{DATA_DIR}/train.tsv")
eval_df = _load_paraphrase_pairs(f"{DATA_DIR}/dev.tsv")

if MAX_TRAIN_ROWS is not None:
    train_df = train_df.head(MAX_TRAIN_ROWS)
if MAX_EVAL_ROWS is not None:
    eval_df = eval_df.head(MAX_EVAL_ROWS)

print(train_df,eval_df)

                                              input_text  ...      prefix
1      The NBA season of 1975 -- 76 was the 30th seas...  ...  paraphrase
3      When comparable rates of flow can be maintaine...  ...  paraphrase
4      It is the seat of Zerendi District in Akmola R...  ...  paraphrase
5      William Henry Henry Harman was born on 17 Febr...  ...  paraphrase
7      With a discrete amount of probabilities Formul...  ...  paraphrase
...                                                  ...  ...         ...
49384  The Romanesque language , Galician ( Galego ) ...  ...  paraphrase
49390  Note that k is a vector consisting of three in...  ...  paraphrase
49393  Tim Henman won in the final 6 -- 2 , 7 -- 6 , ...  ...  paraphrase
49395  He was considered an active member of the coun...  ...  paraphrase
49397  She was in Cork on June 24 and arrived on 8 Ju...  ...  paraphrase

[21829 rows x 3 columns]                                              input_text  ...      prefix
1     They we

In [None]:
# Overrides applied on top of the Seq2SeqArgs defaults, grouped by attribute name.
_arg_overrides = {
    # generation
    "do_sample": True,
    "max_length": 128,
    "num_beams": None,
    "num_return_sequences": 3,
    "top_k": 50,
    "top_p": 0.95,
    # training
    "fp16": False,
    "learning_rate": 5e-5,
    "max_seq_length": 128,
    "num_train_epochs": 1,
    "reprocess_input_data": True,
    "train_batch_size": 8,
    "use_multiprocessing": False,
    # evaluation
    "eval_batch_size": 32,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 2500,
    "evaluate_during_training_verbose": True,
    # outputs and experiment tracking
    "overwrite_output_dir": True,
    "save_eval_checkpoints": False,
    "save_steps": -1,
    "wandb_project": "Paraphrasing with BART",
}

model_args = Seq2SeqArgs()
for _arg_name, _arg_value in _arg_overrides.items():
    setattr(model_args, _arg_name, _arg_value)

# Start from the pretrained BART-large weights and fine-tune on the paraphrase
# pairs, evaluating against eval_df during training.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
)
model.train_model(train_df, eval_data=eval_df)

# Build one "<prefix>: <sentence>" input per evaluation row.
# NOTE(review): the interactive cell further down feeds raw text without this
# prefix. Confirm whether the seq2seq model consumes the `prefix` column during
# training; if it does not, these inputs differ from what the model was trained on.
to_predict = [
    f"{prefix}: {input_text}"
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_df["input_text"].tolist())
]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)

# Saving the predictions if needed
os.makedirs("predictions", exist_ok=True)

# Filesystem-safe timestamp: str(datetime.now()) contains spaces and colons,
# which are awkward in shells and invalid in file names on some platforms.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
predictions_path = os.path.join("predictions", f"predictions_{timestamp}.txt")

# Explicit UTF-8: the sentences contain non-ASCII characters, so the write must
# not depend on the platform's default encoding.
with open(predictions_path, "w", encoding="utf-8") as f:
    for text, reference, candidates in zip(eval_df["input_text"].tolist(), truth, preds):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        f.write(str(reference) + "\n\n")

        f.write("Prediction:\n")
        # `candidates` holds every sequence returned for this input.
        for pred in candidates:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/21829 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2021-08-11 10:07:55.680422: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[34m[1mwandb[0m: W&B syncing is set to `offline` in this directory.  Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.


Running Epoch 0 of 1:   0%|          | 0/2729 [00:00<?, ?it/s]

In [None]:
import logging

from simpletransformers.seq2seq import Seq2SeqModel


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

# Load the fine-tuned model from the training output directory rather than a
# step-numbered checkpoint. The checkpoint name depends on how many rows were
# trained on: "checkpoint-625-epoch-1" matches a 5000-row run at batch size 8,
# while the full run above shows 2729 steps per epoch, so that folder would not
# exist after a clean Restart & Run All.
# NOTE(review): assumes training wrote the final model to the library's default
# "outputs" directory - confirm, or point MODEL_DIR at the desired checkpoint.
MODEL_DIR = "outputs"

model = Seq2SeqModel(encoder_decoder_type="bart", encoder_decoder_name=MODEL_DIR)

SEPARATOR = "---------------------------------------------------------"

# Interactive loop: submit an empty line (or interrupt the kernel / send EOF)
# to stop, so the cell can actually finish.
while True:
    try:
        original = input("Enter text to paraphrase: ")
    except (EOFError, KeyboardInterrupt):
        break
    if not original.strip():
        break

    to_predict = [original]

    preds = model.predict(to_predict)

    print(SEPARATOR)
    print(original)

    print()
    print("Predictions >>>")
    # preds[0] holds every sequence returned for the single input sentence.
    for pred in preds[0]:
        print(pred)

    print(SEPARATOR)
    print()

2021-08-11 10:00:52.932148: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Enter text to paraphrase: illiam Henry Henry Harman was born on 17 February 1828 in Waynesboro , Virginia , where his parents were Lewis and Sally ( Garber ) Harman .


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


---------------------------------------------------------
illiam Henry Henry Harman was born on 17 February 1828 in Waynesboro , Virginia , where his parents were Lewis and Sally ( Garber ) Harman .

Predictions >>>
Henry Harman was born on February 17, 1828 in Waynesboro, Virginia, where his parents were Lewis and Sally ( Garber ) Harman.
Henry Harman was born on February 17, 1828 in Waynesboro, Virginia, where his parents were Lewis and Sally ( Garber ) Harman.
Henry Harman was born on February 17, 1828 in Waynesboro, Virginia, where his parents were Lewis and Sally ( Garber ) Harman.
---------------------------------------------------------

Enter text to paraphrase: illiam Henry Henry Harman was born on 17 February 1828 in Waynesboro , Virginia , where his parents were Lewis and Sally ( Garber ) Harman .


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
illiam Henry Henry Harman was born on 17 February 1828 in Waynesboro , Virginia , where his parents were Lewis and Sally ( Garber ) Harman .

Predictions >>>
Henry Harman was born on February 17, 1828 in Waynesboro, Virginia, where his parents were Lewis and Sally ( Garber ) Harman.
Henry Harman was born on February 17, 1828 in Waynesboro, Virginia, where his parents were Lewis and Sally ( Garber ) Harman.
Henry Harman was born on February 17, 1828 in Waynesboro, Virginia, where his parents were Lewis and Sally ( Garber ) Harman.
---------------------------------------------------------

Enter text to paraphrase: The Soviet Union maintained an embassy in Oslo and a consulate in Barentsburg , while Norway maintained a message in Moscow .


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
The Soviet Union maintained an embassy in Oslo and a consulate in Barentsburg , while Norway maintained a message in Moscow .

Predictions >>>
The Soviet Union maintained an embassy in Oslo and a consulate in Barentsburg, while Norway maintained a message in Moscow.
The Soviet Union maintained an embassy in Oslo and a consulate in Barentsburg, while Norway maintained a message in Moscow.
The Soviet Union maintained an embassy in Oslo and a consulate in Barentsburg, while Norway maintained a message in Moscow.
---------------------------------------------------------

Enter text to paraphrase: The Río Blanco mine is a large copper mine located in the north of Peru in Loreto Region .


Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

---------------------------------------------------------
The Río Blanco mine is a large copper mine located in the north of Peru in Loreto Region .

Predictions >>>
The Río Blanco Mine is a large copper mine located in the north of Peru in Loreto Region.
The Río Blanco Mine is a large copper mine located in the north of Peru in Loreto Region.
The Río Blanco Mine is a large copper mine located in the north of Peru in Loreto Region.
---------------------------------------------------------

