# Login and data download

In [None]:
!pip install aicrowd-cli
!mkdir data

# Utility functions and constants

In [None]:
def preprocess_function(sample):
    """Tokenize a batch of examples into encoder/decoder training features.

    Reads ``sample["text"]`` and ``sample["label"]``, tokenizes both with the
    module-level ``tokenizer`` (padded and truncated to ``MAX_TEXT_LENGTH``
    and ``MAX_LABEL_LENGTH`` respectively) and writes the features back onto
    ``sample``: ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels``.

    NOTE(review): the label masking iterates over a list of token-id lists,
    so this presumably expects to be called with batched input -- confirm.

    Returns:
        The same ``sample`` object with the feature columns added.
    """
    source_texts, target_texts = sample["text"], sample["label"]

    encoded_text = tokenizer(
        source_texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_TEXT_LENGTH,
    )
    encoded_label = tokenizer(
        target_texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LABEL_LENGTH,
    )

    sample["input_ids"] = encoded_text.input_ids
    sample["attention_mask"] = encoded_text.attention_mask
    sample["decoder_input_ids"] = encoded_label.input_ids
    sample["decoder_attention_mask"] = encoded_label.attention_mask

    # Labels are the target token ids with every padding position replaced
    # by -100 (presumably so the loss skips padding -- -100 is the default
    # ignore index of PyTorch's cross-entropy loss).
    pad_id = tokenizer.pad_token_id
    sample["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in encoded_label.input_ids
    ]

    return sample

In [None]:
def generate_predictions(batch):
    """Generate decoded model predictions for a batch of input texts.

    Tokenizes ``batch["text"]`` with the module-level ``tokenizer`` (padded
    and truncated to ``MAX_TEXT_LENGTH``), runs ``model.generate`` on the
    device the model lives on, and decodes the generated ids back to strings
    with special tokens stripped.

    Args:
        batch: Mapping with a ``"text"`` entry holding the input strings.

    Returns:
        The same ``batch`` with a ``"predictions"`` entry containing one
        decoded string per input.
    """
    inputs = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_TEXT_LENGTH,
        return_tensors="pt",
    )

    # Follow the model's device instead of hard-coding "cuda", so the tensors
    # always end up where the model weights are.
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Without an explicit limit, generate() falls back to the library's
    # default generation length (20 tokens unless the config overrides it),
    # which truncates any prediction longer than that. Targets were tokenized
    # with MAX_LABEL_LENGTH, so allow generation up to the same length.
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=MAX_LABEL_LENGTH,
    )

    batch["predictions"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return batch

In [None]:
# Maximum number of tokens kept per input text; the tokenizer pads and
# truncates to this length.
MAX_TEXT_LENGTH = 150
# Targets get the same token budget as the inputs.
MAX_LABEL_LENGTH = MAX_TEXT_LENGTH

# Imports

In [None]:
import pandas as pd
import numpy as np
import os

import torch
import datasets
from datasets import load_dataset
from transformers import EncoderDecoderModel, EncoderDecoderConfig, BertTokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer, BertConfig

# Data preprocessing

In [None]:
# Load each split as a pandas DataFrame. test_dataset is reused later to hold
# the predictions that get written to the submission file.
train_dataset, validation_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/val.csv", "data/test.csv")
)

In [None]:
# Load the same three CSV files through `datasets`, keyed by split name, so
# they can be tokenized with `.map` and fed to the trainer.
split_files = {
    "train": ["data/train.csv"],
    "validation": ["data/val.csv"],
    "test": ["data/test.csv"],
}
dataset = load_dataset("csv", data_files=split_files)

In [None]:
# Tokenizer used by both preprocess_function and generate_predictions; it is
# loaded from the same "bert-base-uncased" checkpoint the model is built from.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
# Batch size shared by the tokenization pass below and the trainer's
# per-device train/eval batches.
BATCH_SIZE = 16

# Tokenize every split in batches; preprocess_function adds the model input
# columns to each batch.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=BATCH_SIZE,
)

In [None]:
# Return only the columns produced by preprocess_function, as torch tensors.
tokenized_datasets.set_format(
    type="torch",
    columns=[
        "input_ids",
        "attention_mask",
        "decoder_input_ids",
        "decoder_attention_mask",
        "labels",
    ],
)

# Define model

In [None]:
# Build the encoder-decoder model with both the encoder and the decoder
# initialized from the pretrained "bert-base-uncased" checkpoint.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

In [None]:
# Wire the tokenizer's special tokens into the combined model config:
# [CLS] starts decoding, [SEP] marks end-of-sequence, [PAD] is the padding id.
# The top-level vocabulary size is copied from the encoder's config.
seq2seq_config = model.config
seq2seq_config.vocab_size = seq2seq_config.encoder.vocab_size
seq2seq_config.pad_token_id = tokenizer.pad_token_id
seq2seq_config.eos_token_id = tokenizer.sep_token_id
seq2seq_config.decoder_start_token_id = tokenizer.cls_token_id

In [None]:
# Number of full passes over the training split.
N_EPOCHS = 10

# Evaluate and checkpoint once per epoch, keeping at most five checkpoints on
# disk, and train with fp16 mixed precision.
# NOTE(review): "Scambled" looks like a typo for "Scrambled"; it is left as-is
# because the string is the output directory name.
args = Seq2SeqTrainingArguments(
    output_dir="Scambled Text",
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
)

In [None]:
# Train on the tokenized train split and evaluate on the validation split,
# using the arguments configured above.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

# Train model

In [None]:
# Run the training loop configured by `args` (N_EPOCHS epochs, with
# evaluation and a checkpoint at the end of each epoch).
trainer.train()

# Prediction

In [None]:
# Generate predictions for the raw (untokenized) test split in batches.
# BATCH_SIZE replaces the duplicated literal 16 so inference follows the same
# batch-size setting as tokenization and training.
results = dataset["test"].map(generate_predictions, batched=True, batch_size=BATCH_SIZE)
# Attach the predictions to the pandas frame that is written out as the
# submission. Both objects were read from data/test.csv, so this assumes
# they share the same row order.
test_dataset["label"] = results["predictions"]

# Make submission

In [None]:
# Create the output directory from Python rather than `!mkdir assets`:
# exist_ok=True keeps the cell re-runnable, and it does not depend on the shell.
os.makedirs("assets", exist_ok=True)
# Write the test frame, including the predicted `label` column, without the
# pandas index.
test_dataset.to_csv(os.path.join("assets", "submission.csv"), index=False)

In [None]:
# Submit this notebook and the contents of assets/ (which holds
# submission.csv) to the "deshuffling-text" challenge via the AIcrowd CLI.
!aicrowd notebook submit -c deshuffling-text -a assets --no-verify