In [1]:
!pip install simpletransformers



In [2]:
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
import logging
import pandas as pd
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data_path = "/content/drive/MyDrive/544_project_data/toxic_span_text_pairs.csv"
data = pd.read_csv(data_path)
data

Unnamed: 0,original,censored
0,Yet call out all Muslims for the acts of a few...,Yet call out all Muslims for the acts of a few...
1,This bitch is nuts. Who would read a book by a...,<c>
2,You're an idiot.,You're an <c>.
3,"Nincompoop, that's a nice one! I'm partial to ...","<c>, <c> a nice one! I'm partial to <c>."
4,testing purposes: \n\nyou are an idiot and i c...,testing purposes: \n\nyou are an <c> and i can...
...,...,...
15494,"Ah, the small minded, short sighted incompeten...","Ah, the <c>, <c> <c> coming out. \nMore jobs, ..."
15495,"For some of these demonic possessed brats, a s...","For some of these demonic possessed <c>, a str..."
15496,Butts was stupid enough to lay out the charade...,<c> enough to lay out the charade for Bannon -...
15497,This dirtbag is no Marine; he's a sadistic ter...,This <c> is no Marine; he's a <c>.


In [5]:
#rename columns and manually split train and test 
data = data.rename(columns={"original": "input_text", "censored":"target_text"})

train = data[:12400]
train_5000 = train.sample(5000)
test = data[12400:]
test_sample = test.sample(100)

In [6]:
#intialize bart model for training
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

model_args = Seq2SeqArgs()

#following part is changed
model_args.evaluate_during_training = False
#model_args.eval_batch_size = 64
#model_args.evaluate_during_training_steps = 2500
#model_args.evaluate_during_training_verbose = True

model_args.fp16 = False
model_args.learning_rate = 5e-5   #5e-5
model_args.max_seq_length = 128
model_args.num_train_epochs = 4
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = False
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.train_batch_size = 4
model_args.use_multiprocessing = False

model_args.do_sample = True
model_args.num_beams = None
model_args.num_return_sequences = 1
model_args.max_length = 60
model_args.top_k = 50
model_args.top_p = 0.95

model_args.wandb_project = "Comment censoring with BART"


model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
    use_cuda=True
)

In [7]:
#train the model on training set
model.train_model(train_5000)

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/5000 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mssimaizz[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Epoch 0 of 4:   0%|          | 0/1250 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/checkpoint-1250-epoch-1


Running Epoch 1 of 4:   0%|          | 0/1250 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/checkpoint-2500-epoch-2


Running Epoch 2 of 4:   0%|          | 0/1250 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/checkpoint-3750-epoch-3


Running Epoch 3 of 4:   0%|          | 0/1250 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/checkpoint-5000-epoch-4
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/
INFO:simpletransformers.seq2seq.seq2seq_model: Training of facebook/bart-large model complete. Saved to outputs/.


(5000, 0.22985084019294008)

In [8]:
test_sample_output = model.predict(list(test_sample["input_text"]))
print(test_sample_output)

Generating outputs:   0%|          | 0/13 [00:00<?, ?it/s]

["Exactly. In 2008, and again in 2012, America's voters could not have been more <c>.", 'Nope, only the <c> voters would believe those lies.', "That's what happened when all the 'politically correct' people elected a <c>, who in himself, hated OUR country.  Notice most of the protesters are black.  If you don't like this country, move to North Korea....you should have one hell of a life", 'No, he was a loud-mouthed <c> on the campaign trail, too. And people still voted for him. Amazing!', 'Go blow your money at a casino instead. <c>.', 'There is nothing wrong with defending an innocent man and political accusations are the worst form of “evidence”.   \nA true pedofile or molester cannot control their impulses and have the highest recidivism rates amongst all criminals.  According to all his �', '"Unlike the 60 million American GOP voters, Germans are generally not <c>."\nIs that you Hillary?', 'Your not understaning grammer, <c>. Your a Leftie poster with no comon senze.', '<c> having 

In [9]:
test_sample["generated_text"] = test_sample_output

In [10]:
test_sample.to_csv("/content/drive/MyDrive/544_project_data/sample_output_fully_trained_model.csv", index=False)