<a href="https://colab.research.google.com/github/Athugodage/RuLawSimplification/blob/main/t5_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/Athugodage/RuLawSimplification.git

In [None]:
import json

# Read the two-way corpus from the cloned repository checkout and parse it
# into the module-level variable `a`.
with open('/content/RuLawSimplification/two_way/two_way.json', mode='r', encoding='utf8') as json_file:
    a = json.loads(json_file.read())

In [None]:
# Inspect the record at index 100 of the loaded data to see its structure.
a[100]

In [None]:
import pandas as pd

# Flatten the nested JSON into one row per entry of each article's
# 'article text' list, keeping that entry's 'text' and 'comment' fields.
#
# The rows are collected first and the frame is built once. The previous
# version called `.append` on the 'Text' column (a Series) and overwrote `df`
# with the result, so the two-column frame was never built; `append` is also
# gone from pandas 2.0, and growing a frame row by row is quadratic.
records = [
    {'Text': sujet['text'], 'Comment': sujet['comment']}
    for article in a
    for sujet in article['article text']
]

# Passing `columns` keeps the Text/Comment layout even when there are no rows.
df = pd.DataFrame(records, columns=['Text', 'Comment'])


In [None]:
# Preview the assembled Text/Comment table.
df

In [None]:
# Save the complete table to 'two_way_trial.csv' without the index column.
df.to_csv('two_way_trial.csv', index=False)

In [None]:
# Row/column count; the split in the next cell uses fixed row offsets (2500 and 2800).
df.shape

In [None]:
# Positional split of the table: rows [0, 2500) go to train, [2500, 2800) to
# validation and everything from 2800 onward to test. Each file is written
# without the index column and without a header row.
for csv_name, rows in (
    ('train.csv', slice(None, 2500)),
    ('validation.csv', slice(2500, 2800)),
    ('test.csv', slice(2800, None)),
):
    df.iloc[rows].to_csv(csv_name, index=False, header=None)

In [None]:
!pip install Sentencepiece
!pip install transformers

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Authenticate this session with the Hugging Face Hub; later cells push a
# dataset and a model to the Hub.
notebook_login()

In [None]:
!pip install datasets

In [None]:
# Read the test split back (no header row, no index column) to check the file contents.
pd.read_csv('test.csv', encoding='utf8', sep=',', index_col=False, header=None)

In [None]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
from datasets import load_dataset


# Load the three header-less CSV files as the train / validation / test
# splits of one dataset object; the parsing options are passed as keyword
# arguments next to the 'csv' loader name.
dataset = load_dataset(
    'csv',
    data_files=dict(
        train=['train.csv'],
        validation=['validation.csv'],
        test=['test.csv'],
    ),
    sep=',',
    header=None,
    index_col=False,
    encoding='utf8',
)

In [None]:
# Show the summary of each split, one per line.
print(dataset['train'], dataset['validation'], dataset['test'], sep='\n')

In [None]:
# Check the dimensions reported for the loaded dataset.
dataset.shape

In [None]:
# Upload the dataset to the Hub under 'marcus2000/twowaydata'.
dataset.push_to_hub('marcus2000/twowaydata')

In [None]:
from datasets import load_dataset

# Reload the dataset from the Hub repository it was pushed to, replacing the
# locally built `dataset` object.
dataset = load_dataset('marcus2000/twowaydata')

In [None]:
# Summary of the training split after reloading.
dataset['train']

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: A dataset that supports `len()`, selection by a list of row
            indices, and exposes a `features` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If `num_examples` is larger than the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, which is what the previous
    # "redraw until the index is new" loop implemented by hand.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
# Display two random rows of the training split.
show_random_elements(dataset["train"], 2)

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoConfig


# Load the tokenizer that belongs to the "t5-base" checkpoint.
# NOTE(review): the column comments in the preprocessing cell are written in
# Russian, which suggests Russian-language data. Confirm that the t5-base
# vocabulary covers Cyrillic text; if it does not, most tokens would
# presumably map to the unknown token. TODO verify.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Other checkpoint names noted by the author:
# "bert-base-cased"
# DeepPavlov/rubert-base-cased-conversational


In [None]:
max_input_length = 100
max_target_length = 100


def preprocess_function(examples):
    """Tokenize one batch of source/target pairs for seq2seq training.

    Column '0' of `examples` is used as the model input and column '1' as the
    target. Inputs are truncated to `max_input_length` tokens and targets to
    `max_target_length` tokens.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the "labels" key.
    """
    source_texts = list(examples['0'])  # column 0: text
    target_texts = list(examples['1'])  # column 1: comment

    encoded = tokenizer(source_texts, truncation=True, max_length=max_input_length)

    # Targets are tokenized inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, truncation=True, max_length=max_target_length)

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Run preprocess_function over the dataset in batched mode.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# Summary of the tokenized dataset.
tokenized_datasets

In [None]:
# Print each training text next to its token ids.
# NOTE(review): this walks the entire training split, so the cell output can
# be very long.
for x in tokenized_datasets['train']:
    print(x['0'], x['input_ids'])

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained "t5-base" sequence-to-sequence model that the trainer below fine-tunes.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import torch

batch_size = 16

# Training configuration; "args" is the local output directory.
args = Seq2SeqTrainingArguments("args",
    evaluation_strategy = "epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    push_to_hub=True,
    # Name the Hub repository explicitly. Without this the repository name is
    # derived from the output directory ("args"), while the inference cell at
    # the end of the notebook loads "marcus2000/seq2seq_04.09.2022".
    hub_model_id="marcus2000/seq2seq_04.09.2022",
    predict_with_generate=True,
    # Mixed precision requires a CUDA device. The notebook allows a CPU
    # fallback when selecting `device`, so enable fp16 only when a GPU is
    # actually available instead of failing at argument construction.
    fp16=torch.cuda.is_available()
)

In [None]:
# Collator that assembles the tokenized examples into batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from datasets import load_metric


# SacreBLEU metric object; compute_metrics calls `metric.compute` on the decoded texts.
metric = load_metric("sacrebleu")

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns:
        A `(predictions, references)` pair in which predictions is a flat list
        of stripped strings and each stripped reference is wrapped in its own
        single-element list.
    """
    stripped_preds = []
    for prediction in preds:
        stripped_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays. When
            predictions is a tuple, its first element is used.

    Returns:
        A dict with "bleu" (the metric's score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded, so it is swapped for
    # the pad token id. Labels were already handled this way; predictions may
    # also contain -100 padding, which would break decoding and be counted as
    # real tokens in gen_len, so they get the same treatment.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Wire the model, training configuration, tokenized splits, collator and
# metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets['validation'],
    train_dataset=tokenized_datasets['train'],
)

In [None]:
# Start fine-tuning with the configuration defined above.
trainer.train()

In [None]:
# Run the trained model on the test split.
# (The variable is spelled `predidcted`; the following cell refers to it by that spelling.)
predidcted = trainer.predict(tokenized_datasets['test'])

In [None]:
# Inspect the object returned by trainer.predict for the test split.
predidcted

In [None]:
# Upload the trained model to the Hub.
# NOTE(review): in transformers, the first positional parameter of
# Trainer.push_to_hub is `commit_message`, so this string is presumably used
# as the commit message rather than as the repository name; the target
# repository comes from the training arguments (hub_model_id / output
# directory). TODO confirm against the installed transformers version.
trainer.push_to_hub("marcus2000/seq2seq_04.09.2022")

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

# Load the fine-tuned checkpoint from the Hub as an inference pipeline.
# `pipeline` (singular) is the factory function; `pipelines` is the package
# that contains it and is not callable, so the previous call raised TypeError.
# Note that this rebinds `model`, which earlier referred to the training model.
model = pipeline(model="marcus2000/seq2seq_04.09.2022")